Código Python para resumir por tareas, analizando las cadenas de contenido de un artículo exportado a HTML.

El programa asume que los títulos y subtítulos están marcados con etiquetas h1, h2, etc.

from bs4 import BeautifulSoup

# Load the HTML content.
# Location of the HTML file to analyse; change it to point at your own export.
file_path = '/mnt/data/micontenido.html'
# Read the whole document as UTF-8 text. The `with` statement closes the file
# handle as soon as the block ends, even if reading raises.
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the topics: every heading tag from <h1> to <h6>, in the order
# find_all() returns them.
headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# Keep only the text of each heading; strip=True removes leading/trailing
# whitespace from the text pieces that make up the heading.
topics = [heading.get_text(strip=True) for heading in headings]

# Create a summary dictionary mapping "Topic 1", "Topic 2", ... to the heading
# text. Numbering starts at 1 and follows the order of `topics`.
summary = {f"Topic {index}": topic for index, topic in enumerate(topics, start=1)}

import pandas as pd

# Convert summary to a DataFrame for better visualization: one row per
# (key, value) pair of `summary`, with the key in "Topic Number" and the
# heading text in "Description".
summary_df = pd.DataFrame(list(summary.items()), columns=["Topic Number", "Description"])

# NOTE(review): ace_tools looks like an environment-specific display helper and
# may not be installed everywhere -- confirm. When it cannot be imported, fall
# back to printing the table so the script still produces its output instead of
# crashing on the import.
try:
    import ace_tools as tools
except ImportError:
    print(summary_df.to_string(index=False))
else:
    tools.display_dataframe_to_user(name="Summary of HTML Topics", dataframe=summary_df)

# Bare expression kept from the original: an interactive/notebook session
# echoes the DataFrame here; it has no visible effect when run as a script.
summary_df