# app.py — Chunk-Powered Webpage Editor (Streamlit app, ~26.5 kB)
# Provenance: Hugging Face Space by Em4e, commit 581abcd ("Update app.py").
import streamlit as st
import requests
from bs4 import BeautifulSoup
from html_to_markdown import convert_to_markdown
import re
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import Document, MetadataMode
import textstat # For readability metrics
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.
    """

    # Tags that never carry readable content; stripped before anything else.
    _NON_CONTENT_TAGS = ('script', 'style', 'noscript', 'meta', 'link')

    # Class names that commonly mark the main article wrapper on blogs/CMSes.
    _CONTENT_WRAPPER_CLASSES = ('content', 'post-body', 'article-content',
                                'entry-content', 'main-content')

    # Bare tag names and CSS selectors for boilerplate removed *inside* the
    # identified main content. Plain alphanumeric entries are treated as tag
    # names (find_all); everything else is a CSS selector (select).
    _BOILERPLATE_SELECTORS = [
        'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
        'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
        'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
        'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
        'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
        'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
        '.social-share', '.comments', '.related-posts', '.pagination',
        '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
        '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
        '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
        '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
        '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
        '.hidden', '.visually-hidden',
        '.no-print', '.print-hide',
        '.wp-block-navigation', '.wp-block-group.is-style-stripes',
        '[class*="column"]', '[class*="grid"]'
    ]

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetch HTML from *url*, isolate the main content, strip boilerplate,
        and convert it to Markdown.

        Returns the Markdown string on success, or a human-readable string
        starting with "Error" / "An unexpected error" on failure — callers
        check for these prefixes rather than catching exceptions.
        """
        try:
            # A browser-like User-Agent avoids the most naive bot blocks
            # (the UI explicitly warns users about bot detection).
            response = requests.get(
                url,
                timeout=10,  # avoid hanging forever on slow servers
                headers={'User-Agent': 'Mozilla/5.0 (compatible; ChunkEditor/1.0)'},
            )
            response.raise_for_status()  # surface 4xx/5xx as RequestException
            soup = BeautifulSoup(response.text, 'html.parser')
            # Aggressive initial removal of tags that are never content.
            for tag_name in self._NON_CONTENT_TAGS:
                for element in soup.find_all(tag_name):
                    element.decompose()
            content_for_conversion = self._locate_main_content(soup)
            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."
            self._remove_boilerplate(content_for_conversion)
            return self._clean_markdown(convert_to_markdown(str(content_for_conversion)))
        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}. Please check the URL or your internet connection."
        except Exception as e:
            return f"An unexpected error occurred during HTML conversion: {e}"

    def _locate_main_content(self, soup):
        """
        Return the tag (or soup fragment) holding the page's main content.

        Priority: semantic containers (<article>, <main>, role=main, common
        main-content divs) -> a wrapper found near the first <h1> -> the H1
        plus its following siblings -> <body> as a last resort.
        """
        content = (soup.find('article') or soup.find('main')
                   or soup.find('div', class_='main-content')
                   or soup.find('div', {'role': 'main'}))
        if content:
            return content
        first_h1 = soup.find('h1')
        if first_h1 is None:
            # Ultimate fallback: the whole body (may be None / empty).
            return soup.body
        wrapper = self._find_wrapper_near_h1(first_h1)
        if wrapper is not None:
            return wrapper
        # Fallback: the H1 and everything after it. Collect the siblings
        # BEFORE moving the H1 — appending an element to another tree
        # detaches it, which would break the next_sibling chain (the
        # original code appended first and therefore never collected any
        # siblings at all).
        trailing = list(first_h1.next_siblings)
        fragment = BeautifulSoup('', 'html.parser')
        fragment.append(first_h1)
        for element in trailing:
            fragment.append(element)
        return fragment

    def _find_wrapper_near_h1(self, first_h1):
        """
        Walk up to 5 ancestor levels from *first_h1* looking for a plausible
        content wrapper. Returns the wrapper tag or None.
        """
        candidate = first_h1.parent
        for _ in range(5):
            if candidate is None:
                return None
            # Explicit parentheses: the role check must also be restricted to
            # the listed wrapper tags (the original's a and b or c precedence
            # let any role="main" element match).
            if candidate.name in ('article', 'main', 'section', 'div') and \
                    (any(cls in candidate.get('class', []) for cls in self._CONTENT_WRAPPER_CLASSES)
                     or candidate.get('role') == 'main'):
                return candidate
            candidate = candidate.parent
        return None

    def _remove_boilerplate(self, content) -> None:
        """Decompose boilerplate elements inside the identified main content."""
        for selector in self._BOILERPLATE_SELECTORS:
            if re.match(r'^[a-zA-Z0-9]+$', selector):
                targets = content.find_all(selector)  # bare tag name
            else:
                targets = content.select(selector)    # CSS selector
            for element in targets:
                element.decompose()

    @staticmethod
    def _clean_markdown(markdown_output: str) -> str:
        """Post-process converted Markdown: collapse blank runs, drop stray bullets."""
        markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
        markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
        markdown_output = re.sub(r'\*{3,}', '', markdown_output)
        return markdown_output.strip()

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parse Markdown into LlamaIndex nodes and wrap each node in a dict
        with ``id``, ``title``, ``content`` and ``original_node`` keys.

        Returns [] for empty input or for any error string produced by
        fetch_and_convert_to_markdown. (The original check matched only two
        of the four error prefixes; timeouts slipped through.)
        """
        if not markdown_content or markdown_content.startswith(("Error", "An unexpected error")):
            return []
        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        print(f"✅ Parsed {len(nodes)} nodes from Markdown.")  # Debug print
        structured_chunks = []
        for current_id, node in enumerate(nodes):
            pure_text = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            heading_title, content_text = self._split_title_and_content(pure_text)
            structured_chunks.append({
                "id": current_id,
                "title": heading_title,
                "content": content_text,
                "original_node": node  # Keep reference to the original LlamaIndex node
            })
        return structured_chunks

    @staticmethod
    def _split_title_and_content(pure_text: str) -> tuple:
        """
        Derive a (title, content) pair from one node's raw text.

        A leading Markdown heading becomes the title (or "[Untitled Section]"
        when the heading is blank); otherwise the first line, truncated to 70
        chars, is used, with "[Empty Section]" as the placeholder for empty text.
        """
        heading_match = re.match(r"^(#+)\s*(.*)", pure_text)
        if heading_match:
            title = heading_match.group(2).strip()
            content = pure_text[len(heading_match.group(0)):].strip()
            if not title:
                title = "[Untitled Section]"
            return title, content
        # No Markdown heading: use the (truncated) first line as the title.
        first_line = pure_text.split('\n')[0].strip()
        title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
        if not title or not pure_text:
            title = "[Empty Section]"
        return title, pure_text
class ChunkManager:
"""
Manages the collection of content chunks, their statistics, and target settings.
Adheres to SRP for chunk data management and OCP by allowing new statistics
or formatting without changing core chunk operations.
"""
def __init__(self):
self._chunks = []
self.target_flesch_min = 60
self.target_grade_max = 8
self.target_min_chunk_words = 50
self.target_max_chunk_words = 500
def set_chunks(self, chunks: list):
"""Sets the internal list of chunks and calculates their initial statistics."""
self._chunks = []
for chunk in chunks:
chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
self._chunks.append(chunk)
def get_chunks(self) -> list:
"""Returns the current list of processed chunks."""
return self._chunks
def _calculate_chunk_stats(self, text: str) -> dict:
"""
Calculates various linguistic statistics for a given text chunk.
(Private helper method, SRP for stats calculation)
"""
stats = {}
cleaned_text = re.sub(r'#+\s*', '', text)
cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
stats['char_count'] = len(cleaned_text)
stats['sentence_count'] = textstat.sentence_count(cleaned_text)
if stats['sentence_count'] > 0:
stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
else:
stats['avg_sentence_length'] = 0
stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
try:
stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
except Exception:
stats['flesch_reading_ease'] = 0
try:
stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
except Exception:
stats['flesch_kincaid_grade'] = 0
try:
stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
except Exception:
stats['gunning_fog_score'] = 0
return stats
def format_chunk_stats(self, stats: dict) -> str:
"""
Formats chunk statistics into a readable string, including explanations for readability scores.
Adheres to SRP for formatting.
"""
flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
stats_str = "#### Chunk Statistics:\n"
stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
stats_str += f"- **Character Count:** {stats['char_count']}\n"
stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
return stats_str
def get_document_summary_stats(self) -> str:
"""
Aggregates statistics for the entire document across all managed chunks.
Adheres to SRP for document-level summary.
"""
if not self._chunks:
return "No document loaded to generate statistics."
total_words = 0
total_chars = 0
total_sentences = 0
total_paragraphs = 0
all_content_text = ""
for chunk in self._chunks:
content_text_for_stats = chunk['content']
# Re-calculate stats for each chunk content to ensure summary is up-to-date
current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
total_words += current_chunk_stats['word_count']
total_chars += current_chunk_stats['char_count']
total_sentences += current_chunk_stats['sentence_count']
total_paragraphs += current_chunk_stats['paragraph_count']
all_content_text += content_text_for_stats + "\n\n"
doc_stats_str = "## Overall Document Statistics:\n"
doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
doc_stats_str += f"- **Total Words:** {total_words}\n"
doc_stats_str += f"- **Total Characters:** {total_chars}\n"
doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
if len(self._chunks) > 0:
doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
if all_content_text.strip():
overall_stats = self._calculate_chunk_stats(all_content_text)
doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
else:
doc_stats_str += "- No content available for overall readability metrics.\n"
return doc_stats_str
def get_chunk_by_id(self, chunk_id: int) -> dict | None:
"""Retrieves a chunk by its ID."""
return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
def get_chunk_titles_for_dropdown(self) -> list:
"""Generates dropdown choices using plain text (no HTML)."""
dropdown_choices = []
for chunk in self._chunks:
title = chunk['title']
dropdown_choices.append(f"{chunk['id']}: {title}")
return dropdown_choices
def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
"""
Updates the content of a chunk, recalculates its stats, and updates its title if needed.
Returns True if successful, False otherwise.
"""
for chunk in self._chunks:
if chunk["id"] == chunk_id:
chunk["content"] = new_content
chunk["stats"] = self._calculate_chunk_stats(new_content)
# Update chunk title if it was a placeholder or empty
if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
first_line = new_content.split('\n')[0].strip()
chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
if not chunk["title"]:
chunk["title"] = "[Empty Section]"
elif not new_content:
chunk["title"] = "[Empty Section]"
return True
return False
def delete_chunk(self, chunk_id: int) -> bool:
"""
Deletes a chunk by ID and re-indexes remaining chunks.
Returns True if successful, False otherwise.
"""
initial_chunk_count = len(self._chunks)
self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
if len(self._chunks) == initial_chunk_count:
return False # Chunk not found
# Re-index IDs to be sequential again
for i, chunk in enumerate(self._chunks):
chunk['id'] = i
return True
def get_final_markdown(self) -> str:
"""Compiles all current chunks into a single Markdown string."""
final_md = ""
if not self._chunks:
return "No content to compile. Please process a URL first."
for chunk in self._chunks:
# Use H1 heading if title is meaningful
if not chunk["title"].startswith("[") and chunk["title"]:
final_md += f"# {chunk['title']}\n\n"
final_md += f"{chunk['content']}\n\n"
return final_md.strip()
def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
"""Sets the global readability and word count targets."""
self.target_flesch_min = flesch_min
self.target_grade_max = grade_max
self.target_min_chunk_words = min_words
self.target_max_chunk_words = max_words
# Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
for chunk in self._chunks:
chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
# --- Streamlit UI Definition ---
# Page config must be the first Streamlit call in the script.
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")
# Initialize session state so objects survive Streamlit's top-to-bottom reruns.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""
if 'chunk_selector' not in st.session_state:
    st.session_state.chunk_selector = None
if 'chunk_content_editor' not in st.session_state:
    st.session_state.chunk_content_editor = ""
if 'final_markdown' not in st.session_state:
    st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."
# Instantiate the managers (local aliases for the session-scoped objects).
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager
# Page header and usage notes; raw HTML is rendered via unsafe_allow_html.
st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.info(
    "ℹ️ **Please Note:**\n\n"
    "- Some URLs may be inaccessible due to restrictive server policies (e.g., firewalls or bot detection).\n"
    "- This is an early version of the app, and you may encounter some bugs."
)
st.markdown("""Enter a URL, fetch its content, and break it into editable 'chunks'. Review statistics, set targets, edit chunks, and compile your final Markdown.<div style="font-size: 0.9em; margin-bottom: 12px;">
Inspired by <a href="https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/" target="_blank">Andrea Volpini</a></div><div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
<span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
<a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank">
<img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 30px;">
</a></div><br>""", unsafe_allow_html=True)
# --- URL Input and Processing ---
col1, col2 = st.columns([4, 1])
with col1:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with col2:
    st.write("") # Spacer
    st.write("") # Spacer
    process_button = st.button("Process URL", use_container_width=True)
# Surface the most recent status (set during the previous run) above the results.
if st.session_state.status_message:
    st.info(st.session_state.status_message)
if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)
            # NOTE(review): fetch's "An unexpected error occurred..." failure
            # string is lowercase "error" and slips past this check — confirm.
            if "Error" in markdown_content:
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."
                # Pre-select the first chunk in the editor dropdown.
                if chunks:
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None
# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])
with tab1:
    st.markdown("## Edit Chunks Individually")
    chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
    if chunk_selector_options:
        try:
            # Find the index of the currently selected item to handle updates
            current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
        except (ValueError, TypeError):
            # Selection is stale (chunk deleted/renamed) or None — use the first option.
            current_selection_index = 0
        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=chunk_selector_options,
            index=current_selection_index,
            key="chunk_selector"
        )
    else:
        # Disabled placeholder dropdown when no document has been processed yet.
        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=["No chunks available"],
            disabled=True
        )
    # Get the currently selected chunk
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        # Dropdown labels are "<id>: <title>"; recover the numeric id.
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)
    if selected_chunk:
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )
        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}" # Unique key to prevent state loss
        )
        # Per-chunk stats rendered as HTML-coloured Markdown.
        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )
        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Force a re-render to update the dropdown with the new title
                # NOTE(review): assigning to st.session_state.chunk_selector after
                # the selectbox with that key was instantiated may raise a
                # StreamlitAPIException on some Streamlit versions — confirm.
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"
        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                # After deletion ids are re-indexed; reselect the first chunk if any remain.
                if chunk_manager.get_chunks():
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None
    else:
        # Placeholder widgets shown before any chunk is selected/available.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")
    st.markdown("---")
    st.markdown("## Final Compiled Markdown")
    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()
    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )
with tab2:
    st.markdown("## Document Summary Statistics")
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)
    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")
    # A form batches the four inputs so one submit applies them all atomically.
    with st.form("targets_form"):
        col1, col2 = st.columns(2)
        with col1:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with col2:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)
        submitted = st.form_submit_button("Set New Targets", use_container_width=True)
        if submitted:
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            # Rerun so the new targets are reflected in chunk stat colours immediately.
            st.rerun()