Em4e committed on
Commit
c7506fd
·
verified ·
1 Parent(s): 581abcd

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -542
app.py CHANGED
@@ -1,542 +1,297 @@
1
- import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from html_to_markdown import convert_to_markdown
5
- import re
6
- from llama_index.core.node_parser import MarkdownNodeParser
7
- from llama_index.core.schema import Document, MetadataMode
8
- import textstat # For readability metrics
9
-
10
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.

    All failure paths of `fetch_and_convert_to_markdown` return a string that
    starts with "Error" so `parse_markdown_into_chunks` can detect them reliably.
    """

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetches HTML content from a given URL, attempts to isolate the main content,
        removes common boilerplate, and converts to Markdown.

        Prioritizes semantic content tags over H1-based identification for robust
        extraction. Returns the cleaned Markdown, or an "Error..."-prefixed message.
        """
        try:
            # Many servers reject the default `python-requests` User-Agent with
            # 403; present a browser-like agent instead.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)  # timeout for robustness
            response.raise_for_status()  # Raise HTTPError for 4xx/5xx responses
            soup = BeautifulSoup(response.text, 'html.parser')

            # Aggressive initial removal of tags that are never content.
            for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
                for element in soup.find_all(tag_name):
                    element.decompose()

            # Prioritize semantic main-content containers.
            content_for_conversion = soup.find('article') or soup.find('main') or \
                soup.find('div', class_='main-content') or \
                soup.find('div', {'role': 'main'})

            # Fallback logic if no main content container was found.
            if not content_for_conversion:
                first_h1 = soup.find('h1')
                if first_h1:
                    candidate_container = first_h1.parent
                    found_main_wrapper_via_h1_parent = False
                    content_classes = ['content', 'post-body', 'article-content',
                                       'entry-content', 'main-content']
                    # Check up to 5 parent levels for a suitable content wrapper.
                    for _ in range(5):
                        if candidate_container is None:
                            break
                        # BUGFIX: the original condition was `A and B or C`, which
                        # Python groups as `(A and B) or C`, accepting ANY element
                        # with role="main" regardless of its tag. The intended
                        # grouping is `A and (B or C)`.
                        if candidate_container.name in ['article', 'main', 'section', 'div'] and (
                            any(cls in candidate_container.get('class', []) for cls in content_classes)
                            or candidate_container.get('role') == 'main'
                        ):
                            content_for_conversion = candidate_container
                            found_main_wrapper_via_h1_parent = True
                            break
                        candidate_container = candidate_container.parent

                    # If no clear wrapper found, take H1 and its direct siblings.
                    if not found_main_wrapper_via_h1_parent:
                        # BUGFIX: snapshot the siblings BEFORE moving first_h1.
                        # `temp_soup.append(first_h1)` re-parents the element and
                        # severs its `next_sibling`, so the original walk over
                        # `first_h1.next_sibling` copied nothing.
                        siblings = list(first_h1.next_siblings)
                        temp_soup = BeautifulSoup('', 'html.parser')
                        temp_soup.append(first_h1)
                        for sibling in siblings:
                            temp_soup.append(sibling)
                        content_for_conversion = temp_soup
                else:
                    # Ultimate fallback: use the entire body.
                    content_for_conversion = soup.body

            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."

            # Selective boilerplate removal within the identified content.
            unwanted_elements_in_content = [
                'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
                'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
                'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
                'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
                'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
                'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
                '.social-share', '.comments', '.related-posts', '.pagination',
                '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
                '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
                '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
                '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
                '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
                '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
                '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
                '.hidden', '.visually-hidden',
                '.no-print', '.print-hide',
                '.wp-block-navigation', '.wp-block-group.is-style-stripes',
                '[class*="column"]', '[class*="grid"]'
            ]
            for selector in unwanted_elements_in_content:
                # Bare tag names go through find_all; everything else is a CSS selector.
                if re.match(r'^[a-zA-Z0-9]+$', selector):
                    for element in content_for_conversion.find_all(selector):
                        element.decompose()
                else:
                    for element in content_for_conversion.select(selector):
                        element.decompose()

            markdown_output = convert_to_markdown(str(content_for_conversion))

            # Post-processing: collapse excess blank lines, drop empty list
            # bullets and runs of asterisks left over from the conversion.
            markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
            markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
            markdown_output = re.sub(r'\*{3,}', '', markdown_output)
            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}. Please check the URL or your internet connection."
        except Exception as e:
            # BUGFIX: prefixed with "Error:" so downstream error detection works.
            return f"Error: An unexpected error occurred during HTML conversion: {e}"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parses Markdown content into LlamaIndex nodes (chunks) and extracts a
        title and body for each. Returns a list of dicts with keys
        `id`, `title`, `content`, `original_node`.
        """
        # BUGFIX: the original only checked two specific error substrings and
        # missed "Error: Request timed out" and the conversion-error message.
        # All fetch failures now share the "Error" prefix.
        if not markdown_content or markdown_content.startswith("Error"):
            return []

        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        print(f" Parsed {len(nodes)} nodes from Markdown.")  # Debug print

        structured_chunks = []
        current_id = 0
        for node in nodes:
            pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            heading_title = ""
            content_text = pure_text_content

            # A chunk starting with "#..." supplies its own title; otherwise
            # derive one from the first line (truncated to 70 chars).
            heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
            if heading_match:
                heading_title = heading_match.group(2).strip()
                content_text = pure_text_content[len(heading_match.group(0)):].strip()
                if not heading_title:
                    heading_title = "[Untitled Section]"
            else:
                first_line = content_text.split('\n')[0].strip()
                heading_title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
                if not heading_title:
                    heading_title = "[Empty Section]"
                elif not content_text:
                    heading_title = "[Empty Section]"

            structured_chunks.append({
                "id": current_id,
                "title": heading_title,
                "content": content_text,
                "original_node": node  # Keep reference to the original LlamaIndex node
            })
            current_id += 1

        return structured_chunks
166
-
167
- class ChunkManager:
168
- """
169
- Manages the collection of content chunks, their statistics, and target settings.
170
- Adheres to SRP for chunk data management and OCP by allowing new statistics
171
- or formatting without changing core chunk operations.
172
- """
173
- def __init__(self):
174
- self._chunks = []
175
- self.target_flesch_min = 60
176
- self.target_grade_max = 8
177
- self.target_min_chunk_words = 50
178
- self.target_max_chunk_words = 500
179
-
180
- def set_chunks(self, chunks: list):
181
- """Sets the internal list of chunks and calculates their initial statistics."""
182
- self._chunks = []
183
- for chunk in chunks:
184
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
185
- self._chunks.append(chunk)
186
-
187
- def get_chunks(self) -> list:
188
- """Returns the current list of processed chunks."""
189
- return self._chunks
190
-
191
- def _calculate_chunk_stats(self, text: str) -> dict:
192
- """
193
- Calculates various linguistic statistics for a given text chunk.
194
- (Private helper method, SRP for stats calculation)
195
- """
196
- stats = {}
197
- cleaned_text = re.sub(r'#+\s*', '', text)
198
- cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
199
- cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
200
-
201
- stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
202
- stats['char_count'] = len(cleaned_text)
203
- stats['sentence_count'] = textstat.sentence_count(cleaned_text)
204
-
205
- if stats['sentence_count'] > 0:
206
- stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
207
- else:
208
- stats['avg_sentence_length'] = 0
209
-
210
- stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
211
-
212
- try:
213
- stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
214
- except Exception:
215
- stats['flesch_reading_ease'] = 0
216
-
217
- try:
218
- stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
219
- except Exception:
220
- stats['flesch_kincaid_grade'] = 0
221
-
222
- try:
223
- stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
224
- except Exception:
225
- stats['gunning_fog_score'] = 0
226
- return stats
227
-
228
- def format_chunk_stats(self, stats: dict) -> str:
229
- """
230
- Formats chunk statistics into a readable string, including explanations for readability scores.
231
- Adheres to SRP for formatting.
232
- """
233
- flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
234
- kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
235
- word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
236
-
237
- stats_str = "#### Chunk Statistics:\n"
238
- stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
239
- stats_str += f"- **Character Count:** {stats['char_count']}\n"
240
- stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
241
- stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
242
- stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
243
- stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
244
- stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
245
- stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
246
- return stats_str
247
-
248
- def get_document_summary_stats(self) -> str:
249
- """
250
- Aggregates statistics for the entire document across all managed chunks.
251
- Adheres to SRP for document-level summary.
252
- """
253
- if not self._chunks:
254
- return "No document loaded to generate statistics."
255
-
256
- total_words = 0
257
- total_chars = 0
258
- total_sentences = 0
259
- total_paragraphs = 0
260
-
261
- all_content_text = ""
262
- for chunk in self._chunks:
263
- content_text_for_stats = chunk['content']
264
- # Re-calculate stats for each chunk content to ensure summary is up-to-date
265
- current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
266
- total_words += current_chunk_stats['word_count']
267
- total_chars += current_chunk_stats['char_count']
268
- total_sentences += current_chunk_stats['sentence_count']
269
- total_paragraphs += current_chunk_stats['paragraph_count']
270
- all_content_text += content_text_for_stats + "\n\n"
271
-
272
- doc_stats_str = "## Overall Document Statistics:\n"
273
- doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
274
- doc_stats_str += f"- **Total Words:** {total_words}\n"
275
- doc_stats_str += f"- **Total Characters:** {total_chars}\n"
276
- doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
277
- doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
278
-
279
- if len(self._chunks) > 0:
280
- doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
281
-
282
- if all_content_text.strip():
283
- overall_stats = self._calculate_chunk_stats(all_content_text)
284
- doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
285
- doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
286
- doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
287
- doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
288
- else:
289
- doc_stats_str += "- No content available for overall readability metrics.\n"
290
- return doc_stats_str
291
-
292
- def get_chunk_by_id(self, chunk_id: int) -> dict | None:
293
- """Retrieves a chunk by its ID."""
294
- return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
295
-
296
- def get_chunk_titles_for_dropdown(self) -> list:
297
- """Generates dropdown choices using plain text (no HTML)."""
298
- dropdown_choices = []
299
- for chunk in self._chunks:
300
- title = chunk['title']
301
- dropdown_choices.append(f"{chunk['id']}: {title}")
302
- return dropdown_choices
303
-
304
- def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
305
- """
306
- Updates the content of a chunk, recalculates its stats, and updates its title if needed.
307
- Returns True if successful, False otherwise.
308
- """
309
- for chunk in self._chunks:
310
- if chunk["id"] == chunk_id:
311
- chunk["content"] = new_content
312
- chunk["stats"] = self._calculate_chunk_stats(new_content)
313
- # Update chunk title if it was a placeholder or empty
314
- if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
315
- first_line = new_content.split('\n')[0].strip()
316
- chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
317
- if not chunk["title"]:
318
- chunk["title"] = "[Empty Section]"
319
- elif not new_content:
320
- chunk["title"] = "[Empty Section]"
321
- return True
322
- return False
323
-
324
- def delete_chunk(self, chunk_id: int) -> bool:
325
- """
326
- Deletes a chunk by ID and re-indexes remaining chunks.
327
- Returns True if successful, False otherwise.
328
- """
329
- initial_chunk_count = len(self._chunks)
330
- self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
331
- if len(self._chunks) == initial_chunk_count:
332
- return False # Chunk not found
333
- # Re-index IDs to be sequential again
334
- for i, chunk in enumerate(self._chunks):
335
- chunk['id'] = i
336
- return True
337
-
338
- def get_final_markdown(self) -> str:
339
- """Compiles all current chunks into a single Markdown string."""
340
- final_md = ""
341
- if not self._chunks:
342
- return "No content to compile. Please process a URL first."
343
- for chunk in self._chunks:
344
- # Use H1 heading if title is meaningful
345
- if not chunk["title"].startswith("[") and chunk["title"]:
346
- final_md += f"# {chunk['title']}\n\n"
347
- final_md += f"{chunk['content']}\n\n"
348
- return final_md.strip()
349
-
350
- def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
351
- """Sets the global readability and word count targets."""
352
- self.target_flesch_min = flesch_min
353
- self.target_grade_max = grade_max
354
- self.target_min_chunk_words = min_words
355
- self.target_max_chunk_words = max_words
356
- # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
357
- for chunk in self._chunks:
358
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
359
-
360
# --- Streamlit UI Definition ---
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")

# Initialize session state so processors and UI selections survive reruns.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""
if 'chunk_selector' not in st.session_state:
    st.session_state.chunk_selector = None
if 'chunk_content_editor' not in st.session_state:
    st.session_state.chunk_content_editor = ""
if 'final_markdown' not in st.session_state:
    st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."

# Instantiate the managers (shared, session-scoped instances).
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager

st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.info(
    "ℹ️ **Please Note:**\n\n"
    "- Some URLs may be inaccessible due to restrictive server policies (e.g., firewalls or bot detection).\n"
    "- This is an early version of the app, and you may encounter some bugs."
)
st.markdown("""Enter a URL, fetch its content, and break it into editable 'chunks'. Review statistics, set targets, edit chunks, and compile your final Markdown.<div style="font-size: 0.9em; margin-bottom: 12px;">
Inspired by <a href="https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/" target="_blank">Andrea Volpini</a></div><div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
<span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
<a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank">
<img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 30px;">
</a></div><br>""", unsafe_allow_html=True)

# --- URL Input and Processing ---
col1, col2 = st.columns([4, 1])
with col1:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with col2:
    st.write("")  # Spacer
    st.write("")  # Spacer
    process_button = st.button("Process URL", use_container_width=True)

if st.session_state.status_message:
    st.info(st.session_state.status_message)

if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)

            # NOTE(review): substring test — a legitimate page containing the
            # word "Error" would be treated as a failure; confirm intent.
            if "Error" in markdown_content:
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."

                # Pre-select the first chunk so the editor opens with content.
                if chunks:
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None

# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])

with tab1:
    st.markdown("## Edit Chunks Individually")

    chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
    if chunk_selector_options:
        try:
            # Find the index of the currently selected item to handle updates
            current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
        except (ValueError, TypeError):
            # Selection missing (e.g. after delete/re-titling): fall back to first.
            current_selection_index = 0

        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=chunk_selector_options,
            index=current_selection_index,
            key="chunk_selector"
        )
    else:
        # Disabled placeholder when no document has been processed yet.
        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=["No chunks available"],
            disabled=True
        )

    # Get the currently selected chunk; options are formatted "id: title".
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)

    if selected_chunk:
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )

        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}"  # Unique key to prevent state loss
        )

        # Per-chunk stats with red/green target colouring (HTML spans).
        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )

        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Force a re-render to update the dropdown with the new title.
                # NOTE(review): assigning to a widget-keyed session_state entry
                # after the widget rendered can raise in some Streamlit
                # versions — verify against the deployed Streamlit release.
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"

        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                # Re-point the selector at the first remaining chunk (if any).
                if chunk_manager.get_chunks():
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None

    else:
        # Disabled placeholders shown until a chunk is selected.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")

    st.markdown("---")
    st.markdown("## Final Compiled Markdown")
    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()

    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )

with tab2:
    st.markdown("## Document Summary Statistics")
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)
    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")

    # Form batches the four target inputs into a single submit/rerun.
    with st.form("targets_form"):
        col1, col2 = st.columns(2)
        with col1:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with col2:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)

        submitted = st.form_submit_button("Set New Targets", use_container_width=True)
        if submitted:
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            st.rerun()
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from html_to_markdown import convert_to_markdown
5
+ import re
6
+ from llama_index.core.node_parser import MarkdownNodeParser
7
+ from llama_index.core.schema import Document, MetadataMode
8
+ import textstat # For readability metrics
9
+
10
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.

    Every failure path of `fetch_and_convert_to_markdown` returns a string that
    starts with "Error" so `parse_markdown_into_chunks` can detect it reliably.
    """

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetches HTML from `url`, isolates the main content, strips boilerplate,
        and converts it to Markdown. Returns the Markdown, or an
        "Error..."-prefixed message on failure.
        """
        try:
            # Browser-like User-Agent: many servers reject the default
            # `python-requests` agent with 403.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove tags that are never content before anything else.
            for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
                for element in soup.find_all(tag_name):
                    element.decompose()

            # Prefer semantic main-content containers.
            content_for_conversion = soup.find('article') or soup.find('main') or \
                soup.find('div', class_='main-content') or \
                soup.find('div', {'role': 'main'})

            # Fallback: climb from the first H1 looking for a content wrapper.
            if not content_for_conversion:
                first_h1 = soup.find('h1')
                if first_h1:
                    candidate_container = first_h1.parent
                    for _ in range(5):  # check up to 5 ancestor levels
                        if candidate_container is None:
                            break
                        if candidate_container.name in ['article', 'main', 'section', 'div']:
                            content_for_conversion = candidate_container
                            break
                        candidate_container = candidate_container.parent
                    if not content_for_conversion:
                        content_for_conversion = first_h1.find_parent()
                else:
                    # Ultimate fallback: the whole body.
                    content_for_conversion = soup.body

            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."

            # Strip common boilerplate within the identified content.
            unwanted_selectors = [
                'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
                'textarea', 'svg', 'figure', 'figcaption',
                '.social-share', '.comments', '.related-posts', '.pagination',
                '.breadcrumbs', '.cookie-consent', '[role="navigation"]',
                '[role="banner"]', '[role="contentinfo"]', '[class*="ad"]', '[id*="ad"]'
            ]
            for selector in unwanted_selectors:
                for element in content_for_conversion.select(selector):
                    element.decompose()

            markdown_output = convert_to_markdown(str(content_for_conversion))
            # Collapse runs of 3+ newlines left over from removed elements.
            markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}."
        except Exception as e:
            # BUGFIX: prefixed with "Error:" so the downstream check catches it.
            return f"Error: An unexpected error occurred: {e}"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Splits Markdown into LlamaIndex nodes and returns a list of
        {"id", "title", "content"} dicts, one per node.
        """
        # BUGFIX: the original used `"Error" in markdown_content`, which
        # discarded any legitimate page merely containing the word "Error".
        # Error returns all share the "Error" prefix, so a prefix test suffices.
        if not markdown_content or markdown_content.startswith("Error"):
            return []
        doc = Document(text=markdown_content)
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        structured_chunks = []
        for i, node in enumerate(nodes):
            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            title_match = re.match(r"^(#+)\s*(.*)", content)
            if title_match:
                title = title_match.group(2).strip() or "[Untitled Section]"
            else:
                # BUGFIX: the original always appended "..." (even to short
                # first lines) and could produce an empty title.
                first_line = content.split('\n')[0].strip()
                title = first_line[:70] + "..." if len(first_line) > 70 else first_line
                if not title:
                    title = "[Empty Section]"
            structured_chunks.append({"id": i, "title": title, "content": content})
        return structured_chunks
96
+
97
+ class ChunkManager:
98
+ def __init__(self):
99
+ self._chunks = []
100
+ self.target_flesch_min = 60
101
+ self.target_grade_max = 8
102
+ self.target_min_chunk_words = 50
103
+ self.target_max_chunk_words = 500
104
+
105
+ def set_chunks(self, chunks: list):
106
+ self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]
107
+
108
+ def get_chunks(self) -> list:
109
+ return self._chunks
110
+
111
+ def _add_stats_to_chunk(self, chunk: dict) -> dict:
112
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
113
+ return chunk
114
+
115
+ def _calculate_chunk_stats(self, text: str) -> dict:
116
+ stats = {}
117
+ try:
118
+ stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
119
+ stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
120
+ stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
121
+ except Exception:
122
+ stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
123
+ return stats
124
+
125
+ def format_chunk_stats(self, stats: dict) -> str:
126
+ flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
127
+ grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
128
+ word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
129
+
130
+ return (
131
+ f"**Word Count:** <span style='color:{word_color}'>{stats.get('word_count', 0)}</span> | "
132
+ f"**Reading Ease:** <span style='color:{flesch_color}'>{stats.get('flesch_reading_ease', 0):.2f}</span> | "
133
+ f"**Grade Level:** <span style='color:{grade_color}'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
134
+ )
135
+
136
+ def get_document_summary_stats(self) -> str:
137
+ if not self._chunks:
138
+ return "No document loaded."
139
+
140
+ total_words = sum(c['stats']['word_count'] for c in self._chunks)
141
+ avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
142
+ avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
143
+
144
+ return (
145
+ f"**Total Chunks:** {len(self._chunks)} | "
146
+ f"**Total Words:** {total_words} | "
147
+ f"**Avg. Reading Ease:** {avg_ease:.2f} | "
148
+ f"**Avg. Grade Level:** {avg_grade:.2f}"
149
+ )
150
+
151
+ def get_chunk_by_id(self, chunk_id: int) -> dict | None:
152
+ return next((c for c in self._chunks if c["id"] == chunk_id), None)
153
+
154
+ def update_chunk_content(self, chunk_id: int, new_content: str):
155
+ chunk = self.get_chunk_by_id(chunk_id)
156
+ if chunk:
157
+ chunk["content"] = new_content
158
+ self._add_stats_to_chunk(chunk)
159
+
160
+ def delete_chunk(self, chunk_id: int):
161
+ self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
162
+ for i, chunk in enumerate(self._chunks):
163
+ chunk['id'] = i
164
+
165
+ def get_final_markdown(self) -> str:
166
+ if not self._chunks:
167
+ return "No content to display."
168
+ return "\n\n".join(f"# {c['title']}\n{c['content']}" for c in self._chunks)
169
+
170
+ def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
171
+ self.target_flesch_min = flesch_min
172
+ self.target_grade_max = grade_max
173
+ self.target_min_chunk_words = min_words
174
+ self.target_max_chunk_words = max_words
175
+ self.set_chunks(self.get_chunks()) # Recalculate stats with new targets
176
+
177
# --- Streamlit entry point: page config, session state, URL processing -------

st.set_page_config(layout="wide", page_title="Webpage Content Editor")

# Initialize session state variables. Streamlit reruns the entire script on
# every interaction, so long-lived objects must persist in st.session_state.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'selected_chunk_id' not in st.session_state:
    st.session_state.selected_chunk_id = None
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""

# Short aliases for the session-scoped singletons used throughout the UI.
processor = st.session_state.content_processor
manager = st.session_state.chunk_manager

st.title("✨ Webpage Content Editor")
st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")

st.info(
    "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
    "This is an early version, so expect a few bugs!",
    icon="ℹ️"
)

url_input = st.text_input("Enter a webpage URL to begin", key="url_input")

if st.button("Process URL", use_container_width=True):
    if url_input:
        with st.spinner("Fetching and processing content..."):
            markdown = processor.fetch_and_convert_to_markdown(url_input)
            # NOTE(review): substring match on "Error" can false-positive when
            # the fetched page content itself contains the word "Error" —
            # confirm against the processor's error-return contract.
            if "Error" in markdown:
                st.session_state.status_message = markdown
                manager.set_chunks([])
            else:
                chunks = processor.parse_markdown_into_chunks(markdown)
                manager.set_chunks(chunks)
                st.session_state.status_message = f"Successfully processed {len(chunks)} chunks." if chunks else "Could not extract content chunks."

        # Pre-select the first chunk (if any) so the editor tab has a target.
        if manager.get_chunks():
            st.session_state.selected_chunk_id = manager.get_chunks()[0]['id']
        else:
            st.session_state.selected_chunk_id = None
        st.rerun()

# Show any pending status toast exactly once, then clear it.
if st.session_state.status_message:
    st.toast(st.session_state.status_message)
    st.session_state.status_message = ""  # Clear message after showing
224
+
225
tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])

# --- Tab 1: per-chunk editor --------------------------------------------------
with tab1:
    chunks = manager.get_chunks()
    if not chunks:
        st.write("Process a URL to start editing chunks.")
    else:
        # Ensure selected_chunk_id is valid — ids are renumbered after a
        # deletion, so a stale selection must fall back to the first chunk.
        if st.session_state.selected_chunk_id not in [c['id'] for c in chunks]:
            st.session_state.selected_chunk_id = chunks[0]['id'] if chunks else None

        if st.session_state.selected_chunk_id is not None:
            # Map chunk id -> human-readable label for the selectbox.
            chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}

            def on_select_change():
                # Callback to update the selected ID in session state
                st.session_state.selected_chunk_id = st.session_state.chunk_selector

            selected_id_from_widget = st.selectbox(
                "Select a chunk to edit",
                options=list(chunk_options.keys()),
                format_func=lambda x: chunk_options[x],
                key="chunk_selector",
                on_change=on_select_change,
                index=list(chunk_options.keys()).index(st.session_state.selected_chunk_id)
            )

            selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)

            if selected_chunk:
                # Colored readability stats line (HTML produced by the manager).
                st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)

                edited_content = st.text_area(
                    "Chunk Content",
                    value=selected_chunk['content'],
                    height=300,
                    key=f"editor_{selected_chunk['id']}"  # Unique key forces re-render
                )

                col1, col2, _ = st.columns([1, 1, 4])
                if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                    manager.update_chunk_content(selected_chunk['id'], edited_content)
                    st.session_state.status_message = "Chunk updated!"
                    st.rerun()

                if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                    old_id = selected_chunk['id']
                    manager.delete_chunk(old_id)
                    st.session_state.status_message = "Chunk deleted!"
                    # Select the next available chunk or none if empty
                    remaining_chunks = manager.get_chunks()
                    st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
                    st.rerun()
278
+
279
# --- Tab 2: document overview, target settings, compiled output ---------------
with tab2:
    st.subheader("Document Overview")
    st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.subheader("Content Targets")
    # A form batches the four inputs into a single submit, so the app only
    # reruns once when targets are applied (not on every keystroke).
    with st.form("targets_form"):
        c1, c2 = st.columns(2)
        f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
        g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
        w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
        w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))

        if st.form_submit_button("Set New Targets", use_container_width=True):
            manager.set_targets(f_min, g_max, w_min, w_max)
            st.session_state.status_message = "Targets updated."
            st.rerun()

    st.subheader("Final Document")
    # disabled=False leaves the compiled output editable, but edits here are
    # never read back into the manager — presumably intentional for copy/paste.
    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")