Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 15, 2025

Commit

ff09cd6

verified ·

1 Parent(s): bff8b24

Create doc_loader.py

Browse files

Files changed (1) hide show

src/doc_loader.py +132 -0

src/doc_loader.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import os
+import io
+import fitz  # PyMuPDF
+import docx
+from pptx import Presentation
+import pandas as pd
+import base64
+from openai import OpenAI
def extract_text_from_file(uploaded_file, use_vision=False, api_key=None):
    """
    Route an uploaded file to the extractor that matches its extension.

    Args:
        uploaded_file: File-like object exposing ``.name`` and ``.read()``;
            it is also handed directly to the format-specific extractors.
        use_vision: When True, ``.pdf``/``.pptx``/``.ppt`` files go through
            the vision pipeline instead of plain text extraction. Has no
            effect on any other extension.
        api_key: Key passed to the vision pipeline. Only needed when the
            vision path is actually taken.

    Returns:
        The extracted text. If the vision path is selected but no API key
        was given, the marker string
        ``"[ERROR: Vision Mode requires an API Key]"`` is returned instead.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # 1. Vision path (only for visual formats: PDF/PPT).
    if use_vision and file_ext in (".pdf", ".pptx", ".ppt"):
        if not api_key:
            return "[ERROR: Vision Mode requires an API Key]"
        return _extract_with_vision_model(uploaded_file, file_ext, api_key)

    # 2. Standard text path (fast, free).
    if file_ext == ".pdf":
        return _extract_pdf(uploaded_file)
    if file_ext in (".docx", ".doc"):
        return _extract_docx(uploaded_file)
    if file_ext in (".pptx", ".ppt"):
        return _extract_pptx(uploaded_file)
    if file_ext in (".xlsx", ".xls", ".csv"):
        return _extract_excel(uploaded_file)
    if file_ext in (".txt", ".md"):
        # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading
        # byte-order mark, which would otherwise leak into the text as "\ufeff".
        return uploaded_file.read().decode("utf-8-sig")

    raise ValueError(f"Unsupported file type: {file_ext}")
+# --- VISION EXTRACTION (The Heavy Lifter) ---
+def _extract_with_vision_model(uploaded_file, file_ext, api_key):
+    """
+    Converts file pages to images and asks GPT-4o to transcribe them
+    into a format compatible with the OutlineProcessor.
+    """
+    client = OpenAI(api_key=api_key)
+    full_text = []
+    # 1. Convert File to Image List
+    images = [] # List of base64 strings
+    if file_ext == ".pdf":
+        # Load PDF from memory
+        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for clarity
+            img_bytes = pix.tobytes("png")
+            b64_img = base64.b64encode(img_bytes).decode('utf-8')
+            images.append(b64_img)
+    # (Note: PPTX vision support requires converting PPT slides to images.
+    # For simplicity, we fallback to standard extraction for PPTX in this prototype
+    # unless you install 'pdf2image' or similar heavy tools.
+    # For now, we'll treat PPTX as text-only or add a placeholder.)
+    elif file_ext in [".pptx", ".ppt"]:
+         return "[System Note: Direct PPT Vision requires server-side rendering tools. Using Text Mode instead.]\n" + _extract_pptx(uploaded_file)
+    # 2. Process Batch (One API call per page to ensure accuracy)
+    # We loop through images. This is slower but handles context per page better.
+    for i, b64_img in enumerate(images):
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Analyze this slide/page. Transcribe the content into a structured, hierarchical outline using markdown bullets (-). If there are tables, convert each row into a bullet point describing the data (e.g., '- The LM2500 has a weight of 4.7 tons'). If there are diagrams, describe the relationships labeled."},
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_img}"}}
+                    ],
+                }
+            ],
+            max_tokens=1000
+        )
+        content = response.choices[0].message.content
+        full_text.append(f"--- Page {i+1} ---\n{content}")
+    return "\n".join(full_text)
+# --- STANDARD EXTRACTORS (Existing Code) ---
def _extract_pdf(uploaded_file):
    """
    Return the text of every page of a PDF, joined with newlines.

    Args:
        uploaded_file: File-like object whose ``read()`` yields the PDF bytes.

    Returns:
        The ``get_text()`` output of each page, in page order, separated by
        ``"\\n"``.
    """
    # Open from memory inside a context manager so the document is closed
    # even if text extraction raises part-way through.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)
def _extract_docx(uploaded_file):
    """
    Collect the text of a Word document as newline-separated lines.

    Non-blank paragraphs come first, in document order. After them, every
    table row that has at least one non-blank cell is added as a single line
    with its non-blank cell texts joined by ``" | "``.

    Args:
        uploaded_file: File path or file-like object accepted by
            ``docx.Document``.

    Returns:
        All collected lines joined with ``"\\n"``.
    """
    document = docx.Document(uploaded_file)

    # Paragraph text, skipping whitespace-only paragraphs.
    lines = [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip()
    ]

    # Table rows are appended after all paragraphs, one line per row.
    for table in document.tables:
        for row in table.rows:
            filled_cells = [cell.text for cell in row.cells if cell.text.strip()]
            if not filled_cells:
                continue
            lines.append(" | ".join(filled_cells))

    return "\n".join(lines)
def _extract_pptx(uploaded_file):
    """
    Collect the text of a PowerPoint presentation as newline-separated lines.

    For each slide, the text of every shape that exposes a non-blank
    ``text`` attribute is added, followed by the slide's speaker notes (if
    any) prefixed with ``[SPEAKER NOTES]: ``.

    Args:
        uploaded_file: File path or file-like object accepted by
            ``pptx.Presentation``.

    Returns:
        All collected text entries joined with ``"\\n"``.
    """
    prs = Presentation(uploaded_file)
    full_text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            # Not every shape type carries text (e.g. pictures), hence hasattr.
            if hasattr(shape, "text") and shape.text.strip():
                full_text.append(shape.text)
        if slide.has_notes_slide:
            # A notes slide may lack a notes placeholder, in which case
            # notes_text_frame is None; treat that as "no notes" instead of
            # failing with AttributeError.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                full_text.append(f"[SPEAKER NOTES]: {notes}")
    return "\n".join(full_text)
+def _extract_excel(uploaded_file):
+    is_csv = uploaded_file.name.lower().endswith(".csv")
+    if is_csv:
+        df = pd.read_csv(uploaded_file)
+    else:
+        df = pd.read_excel(uploaded_file)
+    try:
+        return df.to_markdown(index=False)
+    except:
+        return df.to_string(index=False)