Código Python para resumir por tareas, analizando cadenas de contenido de un artículo exportado a HTML.
El programa asume que los títulos y subtítulos están marcados con etiquetas h1, h2, etc.
from bs4 import BeautifulSoup
import pandas as pd

# Read the exported article from disk as UTF-8 text.
file_path = '/mnt/data/micontenido.html'
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content.
soup = BeautifulSoup(html_content, 'html.parser')

# Every heading element (h1-h6) is treated as one topic; the topic text is
# the heading's text with surrounding whitespace stripped.
headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
topics = [heading.get_text(strip=True) for heading in headings]

# Label the topics "Topic 1", "Topic 2", ... in the order they were found.
summary = {f"Topic {index}": topic for index, topic in enumerate(topics, start=1)}

# Convert the summary to a two-column DataFrame for better visualization.
summary_df = pd.DataFrame(list(summary.items()), columns=["Topic Number", "Description"])

# Imported next to its only use.
# NOTE(review): ace_tools looks like an environment-specific display helper;
# confirm it is available wherever this script is expected to run.
import ace_tools as tools
tools.display_dataframe_to_user(name="Summary of HTML Topics", dataframe=summary_df)

# Bare expression kept so an interactive session echoes the table.
summary_df