# app.py — Chunk-Powered Webpage Editor (Streamlit app, ~26.5 kB)
# Provenance: Hugging Face Space by Em4e, commit 581abcd ("Update app.py").
import streamlit as st
import requests
from bs4 import BeautifulSoup
from html_to_markdown import convert_to_markdown
import re
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import Document, MetadataMode
import textstat # For readability metrics
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.
    """

    # Tags that never carry readable content; stripped before anything else.
    _NON_CONTENT_TAGS = ('script', 'style', 'noscript', 'meta', 'link')

    # Class names that commonly mark the main article wrapper on blogs/CMSes.
    _CONTENT_WRAPPER_CLASSES = ('content', 'post-body', 'article-content',
                                'entry-content', 'main-content')

    # Bare tag names and CSS selectors for boilerplate removed *inside* the
    # identified main content. Plain alphanumeric entries are treated as tag
    # names (find_all); everything else is a CSS selector (select).
    _BOILERPLATE_SELECTORS = [
        'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
        'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
        'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
        'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
        'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
        'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
        '.social-share', '.comments', '.related-posts', '.pagination',
        '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
        '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
        '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
        '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
        '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
        '.hidden', '.visually-hidden',
        '.no-print', '.print-hide',
        '.wp-block-navigation', '.wp-block-group.is-style-stripes',
        '[class*="column"]', '[class*="grid"]'
    ]

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetch HTML from *url*, isolate the main content, strip boilerplate,
        and convert it to Markdown.

        Returns the Markdown string on success, or a human-readable string
        starting with "Error" / "An unexpected error" on failure — callers
        check for these prefixes rather than catching exceptions.
        """
        try:
            # A browser-like User-Agent avoids the most naive bot blocks
            # (the UI explicitly warns users about bot detection).
            response = requests.get(
                url,
                timeout=10,  # avoid hanging forever on slow servers
                headers={'User-Agent': 'Mozilla/5.0 (compatible; ChunkEditor/1.0)'},
            )
            response.raise_for_status()  # surface 4xx/5xx as RequestException
            soup = BeautifulSoup(response.text, 'html.parser')
            # Aggressive initial removal of tags that are never content.
            for tag_name in self._NON_CONTENT_TAGS:
                for element in soup.find_all(tag_name):
                    element.decompose()
            content_for_conversion = self._locate_main_content(soup)
            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."
            self._remove_boilerplate(content_for_conversion)
            return self._clean_markdown(convert_to_markdown(str(content_for_conversion)))
        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}. Please check the URL or your internet connection."
        except Exception as e:
            return f"An unexpected error occurred during HTML conversion: {e}"

    def _locate_main_content(self, soup):
        """
        Return the tag (or soup fragment) holding the page's main content.

        Priority: semantic containers (<article>, <main>, role=main, common
        main-content divs) -> a wrapper found near the first <h1> -> the H1
        plus its following siblings -> <body> as a last resort.
        """
        content = (soup.find('article') or soup.find('main')
                   or soup.find('div', class_='main-content')
                   or soup.find('div', {'role': 'main'}))
        if content:
            return content
        first_h1 = soup.find('h1')
        if first_h1 is None:
            # Ultimate fallback: the whole body (may be None / empty).
            return soup.body
        wrapper = self._find_wrapper_near_h1(first_h1)
        if wrapper is not None:
            return wrapper
        # Fallback: the H1 and everything after it. Collect the siblings
        # BEFORE moving the H1 — appending an element to another tree
        # detaches it, which would break the next_sibling chain (the
        # original code appended first and therefore never collected any
        # siblings at all).
        trailing = list(first_h1.next_siblings)
        fragment = BeautifulSoup('', 'html.parser')
        fragment.append(first_h1)
        for element in trailing:
            fragment.append(element)
        return fragment

    def _find_wrapper_near_h1(self, first_h1):
        """
        Walk up to 5 ancestor levels from *first_h1* looking for a plausible
        content wrapper. Returns the wrapper tag or None.
        """
        candidate = first_h1.parent
        for _ in range(5):
            if candidate is None:
                return None
            # Explicit parentheses: the role check must also be restricted to
            # the listed wrapper tags (the original's a and b or c precedence
            # let any role="main" element match).
            if candidate.name in ('article', 'main', 'section', 'div') and \
                    (any(cls in candidate.get('class', []) for cls in self._CONTENT_WRAPPER_CLASSES)
                     or candidate.get('role') == 'main'):
                return candidate
            candidate = candidate.parent
        return None

    def _remove_boilerplate(self, content) -> None:
        """Decompose boilerplate elements inside the identified main content."""
        for selector in self._BOILERPLATE_SELECTORS:
            if re.match(r'^[a-zA-Z0-9]+$', selector):
                targets = content.find_all(selector)  # bare tag name
            else:
                targets = content.select(selector)    # CSS selector
            for element in targets:
                element.decompose()

    @staticmethod
    def _clean_markdown(markdown_output: str) -> str:
        """Post-process converted Markdown: collapse blank runs, drop stray bullets."""
        markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
        markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
        markdown_output = re.sub(r'\*{3,}', '', markdown_output)
        return markdown_output.strip()

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parse Markdown into LlamaIndex nodes and wrap each node in a dict
        with ``id``, ``title``, ``content`` and ``original_node`` keys.

        Returns [] for empty input or for any error string produced by
        fetch_and_convert_to_markdown. (The original check matched only two
        of the four error prefixes; timeouts slipped through.)
        """
        if not markdown_content or markdown_content.startswith(("Error", "An unexpected error")):
            return []
        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        print(f"✅ Parsed {len(nodes)} nodes from Markdown.")  # Debug print
        structured_chunks = []
        for current_id, node in enumerate(nodes):
            pure_text = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            heading_title, content_text = self._split_title_and_content(pure_text)
            structured_chunks.append({
                "id": current_id,
                "title": heading_title,
                "content": content_text,
                "original_node": node  # Keep reference to the original LlamaIndex node
            })
        return structured_chunks

    @staticmethod
    def _split_title_and_content(pure_text: str) -> tuple:
        """
        Derive a (title, content) pair from one node's raw text.

        A leading Markdown heading becomes the title (or "[Untitled Section]"
        when the heading is blank); otherwise the first line, truncated to 70
        chars, is used, with "[Empty Section]" as the placeholder for empty text.
        """
        heading_match = re.match(r"^(#+)\s*(.*)", pure_text)
        if heading_match:
            title = heading_match.group(2).strip()
            content = pure_text[len(heading_match.group(0)):].strip()
            if not title:
                title = "[Untitled Section]"
            return title, content
        # No Markdown heading: use the (truncated) first line as the title.
        first_line = pure_text.split('\n')[0].strip()
        title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
        if not title or not pure_text:
            title = "[Empty Section]"
        return title, pure_text
class ChunkManager:
"""
Manages the collection of content chunks, their statistics, and target settings.
Adheres to SRP for chunk data management and OCP by allowing new statistics
or formatting without changing core chunk operations.
"""
def __init__(self):
self._chunks = []
self.target_flesch_min = 60
self.target_grade_max = 8
self.target_min_chunk_words = 50
self.target_max_chunk_words = 500
def set_chunks(self, chunks: list):
"""Sets the internal list of chunks and calculates their initial statistics."""
self._chunks = []
for chunk in chunks:
chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
self._chunks.append(chunk)
def get_chunks(self) -> list:
"""Returns the current list of processed chunks."""
return self._chunks
def _calculate_chunk_stats(self, text: str) -> dict:
"""
Calculates various linguistic statistics for a given text chunk.
(Private helper method, SRP for stats calculation)
"""
stats = {}
cleaned_text = re.sub(r'#+\s*', '', text)
cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
stats['char_count'] = len(cleaned_text)
stats['sentence_count'] = textstat.sentence_count(cleaned_text)
if stats['sentence_count'] > 0:
stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
else:
stats['avg_sentence_length'] = 0
stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
try:
stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
except Exception:
stats['flesch_reading_ease'] = 0
try:
stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
except Exception:
stats['flesch_kincaid_grade'] = 0
try:
stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
except Exception:
stats['gunning_fog_score'] = 0
return stats
def format_chunk_stats(self, stats: dict) -> str:
"""
Formats chunk statistics into a readable string, including explanations for readability scores.
Adheres to SRP for formatting.
"""
flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
stats_str = "#### Chunk Statistics:\n"
stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
stats_str += f"- **Character Count:** {stats['char_count']}\n"
stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
return stats_str
def get_document_summary_stats(self) -> str:
"""
Aggregates statistics for the entire document across all managed chunks.
Adheres to SRP for document-level summary.
"""
if not self._chunks:
return "No document loaded to generate statistics."
total_words = 0
total_chars = 0
total_sentences = 0
total_paragraphs = 0
all_content_text = ""
for chunk in self._chunks:
content_text_for_stats = chunk['content']
# Re-calculate stats for each chunk content to ensure summary is up-to-date
current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
total_words += current_chunk_stats['word_count']
total_chars += current_chunk_stats['char_count']
total_sentences += current_chunk_stats['sentence_count']
total_paragraphs += current_chunk_stats['paragraph_count']
all_content_text += content_text_for_stats + "\n\n"
doc_stats_str = "## Overall Document Statistics:\n"
doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
doc_stats_str += f"- **Total Words:** {total_words}\n"
doc_stats_str += f"- **Total Characters:** {total_chars}\n"
doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
if len(self._chunks) > 0:
doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
if all_content_text.strip():
overall_stats = self._calculate_chunk_stats(all_content_text)
doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
else:
doc_stats_str += "- No content available for overall readability metrics.\n"
return doc_stats_str
def get_chunk_by_id(self, chunk_id: int) -> dict | None:
"""Retrieves a chunk by its ID."""
return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
def get_chunk_titles_for_dropdown(self) -> list:
"""Generates dropdown choices using plain text (no HTML)."""
dropdown_choices = []
for chunk in self._chunks:
title = chunk['title']
dropdown_choices.append(f"{chunk['id']}: {title}")
return dropdown_choices
def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
"""
Updates the content of a chunk, recalculates its stats, and updates its title if needed.
Returns True if successful, False otherwise.
"""
for chunk in self._chunks:
if chunk["id"] == chunk_id:
chunk["content"] = new_content
chunk["stats"] = self._calculate_chunk_stats(new_content)
# Update chunk title if it was a placeholder or empty
if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
first_line = new_content.split('\n')[0].strip()
chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
if not chunk["title"]:
chunk["title"] = "[Empty Section]"
elif not new_content:
chunk["title"] = "[Empty Section]"
return True
return False
def delete_chunk(self, chunk_id: int) -> bool:
"""
Deletes a chunk by ID and re-indexes remaining chunks.
Returns True if successful, False otherwise.
"""
initial_chunk_count = len(self._chunks)
self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
if len(self._chunks) == initial_chunk_count:
return False # Chunk not found
# Re-index IDs to be sequential again
for i, chunk in enumerate(self._chunks):
chunk['id'] = i
return True
def get_final_markdown(self) -> str:
"""Compiles all current chunks into a single Markdown string."""
final_md = ""
if not self._chunks:
return "No content to compile. Please process a URL first."
for chunk in self._chunks:
# Use H1 heading if title is meaningful
if not chunk["title"].startswith("[") and chunk["title"]:
final_md += f"# {chunk['title']}\n\n"
final_md += f"{chunk['content']}\n\n"
return final_md.strip()
def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
"""Sets the global readability and word count targets."""
self.target_flesch_min = flesch_min
self.target_grade_max = grade_max
self.target_min_chunk_words = min_words
self.target_max_chunk_words = max_words
# Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
for chunk in self._chunks:
chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
# --- Streamlit UI Definition ---
# Page config must be the first Streamlit call in the script.
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")
# Initialize session state so objects survive Streamlit's top-to-bottom reruns.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""
if 'chunk_selector' not in st.session_state:
    st.session_state.chunk_selector = None
if 'chunk_content_editor' not in st.session_state:
    st.session_state.chunk_content_editor = ""
if 'final_markdown' not in st.session_state:
    st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."
# Instantiate the managers (local aliases for the session-scoped objects).
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager
# Page header and usage notes; raw HTML is rendered via unsafe_allow_html.
st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.info(
    "ℹ️ **Please Note:**\n\n"
    "- Some URLs may be inaccessible due to restrictive server policies (e.g., firewalls or bot detection).\n"
    "- This is an early version of the app, and you may encounter some bugs."
)
st.markdown("""Enter a URL, fetch its content, and break it into editable 'chunks'. Review statistics, set targets, edit chunks, and compile your final Markdown.<div style="font-size: 0.9em; margin-bottom: 12px;">
Inspired by <a href="https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/" target="_blank">Andrea Volpini</a></div><div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
<span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
<a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank">
<img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 30px;">
</a></div><br>""", unsafe_allow_html=True)
# --- URL Input and Processing ---
col1, col2 = st.columns([4, 1])
with col1:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with col2:
    st.write("") # Spacer
    st.write("") # Spacer
    process_button = st.button("Process URL", use_container_width=True)
# Surface the most recent status (set during the previous run) above the results.
if st.session_state.status_message:
    st.info(st.session_state.status_message)
if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)
            # NOTE(review): fetch's "An unexpected error occurred..." failure
            # string is lowercase "error" and slips past this check — confirm.
            if "Error" in markdown_content:
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."
                # Pre-select the first chunk in the editor dropdown.
                if chunks:
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None
# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])
with tab1:
    st.markdown("## Edit Chunks Individually")
    chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
    if chunk_selector_options:
        try:
            # Find the index of the currently selected item to handle updates
            current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
        except (ValueError, TypeError):
            # Selection is stale (chunk deleted/renamed) or None — use the first option.
            current_selection_index = 0
        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=chunk_selector_options,
            index=current_selection_index,
            key="chunk_selector"
        )
    else:
        # Disabled placeholder dropdown when no document has been processed yet.
        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=["No chunks available"],
            disabled=True
        )
    # Get the currently selected chunk
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        # Dropdown labels are "<id>: <title>"; recover the numeric id.
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)
    if selected_chunk:
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )
        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}" # Unique key to prevent state loss
        )
        # Per-chunk stats rendered as HTML-coloured Markdown.
        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )
        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Force a re-render to update the dropdown with the new title
                # NOTE(review): assigning to st.session_state.chunk_selector after
                # the selectbox with that key was instantiated may raise a
                # StreamlitAPIException on some Streamlit versions — confirm.
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"
        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                # After deletion ids are re-indexed; reselect the first chunk if any remain.
                if chunk_manager.get_chunks():
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None
    else:
        # Placeholder widgets shown before any chunk is selected/available.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")
    st.markdown("---")
    st.markdown("## Final Compiled Markdown")
    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()
    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )
with tab2:
    st.markdown("## Document Summary Statistics")
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)
    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")
    # A form batches the four inputs so one submit applies them all atomically.
    with st.form("targets_form"):
        col1, col2 = st.columns(2)
        with col1:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with col2:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)
        submitted = st.form_submit_button("Set New Targets", use_container_width=True)
        if submitted:
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            # Rerun so the new targets are reflected in chunk stat colours immediately.
            st.rerun()