Spaces:
Sleeping
Sleeping
Update app/pages/utils_chapter/chapter_extraction.py
Browse files
app/pages/utils_chapter/chapter_extraction.py
CHANGED
|
@@ -3,6 +3,7 @@ from app.utils import debug_log, breaks
|
|
| 3 |
from app.backend.get_requests import extract_chapters_from_toc
|
| 4 |
from app.backend.raw_text_processing import extract_toc, extract_chapters
|
| 5 |
from app.backend.text_processing import chapters_chunking
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def page_range_selector_ui():
|
|
@@ -54,10 +55,17 @@ def handle_page_range_submission(start_page, end_page):
|
|
| 54 |
|
| 55 |
|
| 56 |
def extract_content_if_needed():
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
debug_log(f"TOC preview: {st.session_state.get('toc', '')[:200]}...")
|
| 62 |
|
| 63 |
# Extract chapters dictionary if not already present
|
|
|
|
| 3 |
from app.backend.get_requests import extract_chapters_from_toc
|
| 4 |
from app.backend.raw_text_processing import extract_toc, extract_chapters
|
| 5 |
from app.backend.text_processing import chapters_chunking
|
| 6 |
+
from app.backend.toc_cleaning import extract_font_info, extract_lines_from_font_info, TextCleaner
|
| 7 |
|
| 8 |
|
| 9 |
def page_range_selector_ui():
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
def extract_content_if_needed():
|
| 58 |
+
"""Extract TOC and chapters if not already done."""
|
| 59 |
+
toc_page_tuple = st.session_state.get("toc_page_range")
|
| 60 |
+
toc_page_range = range(toc_page_tuple[0], toc_page_tuple[1] + 1)
|
| 61 |
+
pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
|
| 62 |
+
debug_log(f"page range: {toc_page_range}")
|
| 63 |
+
|
| 64 |
+
# Extract raw toc and clean it
|
| 65 |
+
font_info = extract_font_info(pdf_bytes, toc_page_range)
|
| 66 |
+
lines = extract_lines_from_font_info(font_info)
|
| 67 |
+
cleaner = TextCleaner()
|
| 68 |
+
st.session_state['toc'] = cleaner.process(lines)
|
| 69 |
debug_log(f"TOC preview: {st.session_state.get('toc', '')[:200]}...")
|
| 70 |
|
| 71 |
# Extract chapters dictionary if not already present
|