Em4e committed on
Commit
54f6925
·
verified ·
1 Parent(s): e69dbdc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -8
app.py CHANGED
@@ -5,7 +5,7 @@ import re
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
- from markitdown import MarkItDown # <-- MODIFIED: Corrected class name casing
9
 
10
  # --- Core Logic Classes ---
11
 
@@ -20,7 +20,7 @@ class WebpageContentProcessor:
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
  Fetches HTML content, removes common boilerplate tags from the entire page,
23
- and then converts the remaining body content to Markdown using MarkItDown.
24
  """
25
  try:
26
  headers = {
@@ -42,11 +42,9 @@ class WebpageContentProcessor:
42
  if not content_container:
43
  return "Error: Could not find the <body> of the webpage."
44
 
45
- # --- MODIFIED: Corrected MarkItDown usage for the installed library version ---
46
- # 1. Instantiate the converter object.
47
- markdown_converter_instance = MarkItDown()
48
- # 2. Call the .convert() method with the HTML content.
49
- markdown_output = markdown_converter_instance.convert(str(content_container))
50
  # -----------------------------------------------
51
 
52
  # Post-processing to clean up the resulting Markdown
@@ -59,6 +57,7 @@ class WebpageContentProcessor:
59
  except requests.exceptions.RequestException as e:
60
  return f"Error fetching the URL: {e}. Please check the URL and your connection."
61
  except Exception as e:
 
62
  return f"An unexpected error occurred during content processing: {e}"
63
 
64
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
@@ -323,4 +322,4 @@ with tab2:
323
  st.rerun()
324
 
325
  st.subheader("Final Compiled Document")
326
- st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
 
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
+ from markdownify import markdownify as md # <-- MODIFIED: Switched to markdownify
9
 
10
  # --- Core Logic Classes ---
11
 
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
  Fetches HTML content, removes common boilerplate tags from the entire page,
23
+ and then converts the remaining body content to Markdown using markdownify.
24
  """
25
  try:
26
  headers = {
 
42
  if not content_container:
43
  return "Error: Could not find the <body> of the webpage."
44
 
45
+ # --- MODIFIED: Switched to markdownify for conversion ---
46
+ # markdownify is a simple function call.
47
+ markdown_output = md(str(content_container))
 
 
48
  # -----------------------------------------------
49
 
50
  # Post-processing to clean up the resulting Markdown
 
57
  except requests.exceptions.RequestException as e:
58
  return f"Error fetching the URL: {e}. Please check the URL and your connection."
59
  except Exception as e:
60
+ # Nothing is logged here; the exception text is returned to the caller in the message below
61
  return f"An unexpected error occurred during content processing: {e}"
62
 
63
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
 
322
  st.rerun()
323
 
324
  st.subheader("Final Compiled Document")
325
+ st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")