Em4e committed on
Commit
4d98418
·
verified ·
1 Parent(s): fc54d8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -6
app.py CHANGED
@@ -3,7 +3,7 @@ import requests
3
  from bs4 import BeautifulSoup
4
  from html_to_markdown import convert_to_markdown
5
  import re
6
- from llama_index.core.node_parser import MarkdownNodeParser
7
  from llama_index.core.schema import Document, MetadataMode
8
  import textstat # For readability metrics
9
 
@@ -81,19 +81,41 @@ class WebpageContentProcessor:
81
  return f"An unexpected error occurred: {e}"
82
 
83
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
 
 
 
 
84
  if not markdown_content or "Error" in markdown_content:
85
  return []
 
 
 
 
 
 
 
 
86
  doc = Document(text=markdown_content)
87
- parser = MarkdownNodeParser(include_metadata=True)
88
  nodes = parser.get_nodes_from_documents([doc])
 
89
  structured_chunks = []
90
  for i, node in enumerate(nodes):
91
  content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
92
  if not content:
93
  continue
94
- title_match = re.match(r"^(#+)\s*(.*)", content)
95
- title = title_match.group(2).strip() if title_match and title_match.group(2).strip() else (content.split('\n')[0][:70] + "...")
96
- structured_chunks.append({"id": i, "title": title, "content": content})
 
 
 
 
 
 
 
 
 
 
97
  return structured_chunks
98
 
99
  class ChunkManager:
@@ -306,4 +328,4 @@ with tab2:
306
  st.rerun()
307
 
308
  st.subheader("Final Document")
309
- st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")
 
3
  from bs4 import BeautifulSoup
4
  from html_to_markdown import convert_to_markdown
5
  import re
6
+ from llama_index.core.node_parser import SentenceSplitter
7
  from llama_index.core.schema import Document, MetadataMode
8
  import textstat # For readability metrics
9
 
 
81
  return f"An unexpected error occurred: {e}"
82
 
83
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
84
+ """
85
+ Parses markdown content into chunks using a sentence splitter for more
86
+ reliable chunking than header-based splitting.
87
+ """
88
  if not markdown_content or "Error" in markdown_content:
89
  return []
90
+
91
+ # Use SentenceSplitter for more reliable chunking based on size,
92
+ # rather than relying on Markdown headers which may not exist.
93
+ parser = SentenceSplitter(
94
+ chunk_size=2000, # Target chunk size in tokens (SentenceSplitter counts tokens, not characters)
95
+ chunk_overlap=200 # Tokens of overlap between consecutive chunks (not characters)
96
+ )
97
+
98
  doc = Document(text=markdown_content)
 
99
  nodes = parser.get_nodes_from_documents([doc])
100
+
101
  structured_chunks = []
102
  for i, node in enumerate(nodes):
103
  content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
104
  if not content:
105
  continue
106
+
107
+ # Generate a title from the first non-empty line of the chunk
108
+ first_line = next((line for line in content.split('\n') if line.strip()), "")
109
+ title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
110
+ if not title:
111
+ title = f"Chunk {i+1}"
112
+
113
+ structured_chunks.append({
114
+ "id": i,
115
+ "title": title,
116
+ "content": content
117
+ })
118
+
119
  return structured_chunks
120
 
121
  class ChunkManager:
 
328
  st.rerun()
329
 
330
  st.subheader("Final Document")
331
+ st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")