davidepanza committed on
Commit
9cbaedc
·
verified ·
1 Parent(s): ec294c1

Update app/pages/utils_chapter/chapter_extraction.py

Browse files
app/pages/utils_chapter/chapter_extraction.py CHANGED
@@ -3,6 +3,7 @@ from app.utils import debug_log, breaks
3
  from app.backend.get_requests import extract_chapters_from_toc
4
  from app.backend.raw_text_processing import extract_toc, extract_chapters
5
  from app.backend.text_processing import chapters_chunking
 
6
 
7
 
8
  def page_range_selector_ui():
@@ -54,10 +55,17 @@ def handle_page_range_submission(start_page, end_page):
54
 
55
 
56
  def extract_content_if_needed():
57
- toc_range = st.session_state.get("toc_page_range")
58
-
59
- # Extract TOC
60
- extract_toc(toc_range)
 
 
 
 
 
 
 
61
  debug_log(f"TOC preview: {st.session_state.get('toc', '')[:200]}...")
62
 
63
  # Extract chapters dictionary if not already present
 
3
  from app.backend.get_requests import extract_chapters_from_toc
4
  from app.backend.raw_text_processing import extract_toc, extract_chapters
5
  from app.backend.text_processing import chapters_chunking
6
+ from app.backend.toc_cleaning import extract_font_info, extract_lines_from_font_info, TextCleaner
7
 
8
 
9
  def page_range_selector_ui():
 
55
 
56
 
57
  def extract_content_if_needed():
58
+ """Extract TOC and chapters if not already done."""
59
+ toc_page_tuple = st.session_state.get("toc_page_range")
60
+ toc_page_range = range(toc_page_tuple[0], toc_page_tuple[1] + 1)
61
+ pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
62
+ debug_log(f"page range: {toc_page_range}")
63
+
64
+ # Extract raw toc and clean it
65
+ font_info = extract_font_info(pdf_bytes, toc_page_range)
66
+ lines = extract_lines_from_font_info(font_info)
67
+ cleaner = TextCleaner()
68
+ st.session_state['toc'] = cleaner.process(lines)
69
  debug_log(f"TOC preview: {st.session_state.get('toc', '')[:200]}...")
70
 
71
  # Extract chapters dictionary if not already present