NavyDevilDoc committed on
Commit
ff09cd6
·
verified ·
1 Parent(s): bff8b24

Create doc_loader.py

Browse files
Files changed (1) hide show
  1. src/doc_loader.py +132 -0
src/doc_loader.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import fitz # PyMuPDF
4
+ import docx
5
+ from pptx import Presentation
6
+ import pandas as pd
7
+ import base64
8
+ from openai import OpenAI
9
+
10
def extract_text_from_file(uploaded_file, use_vision=False, api_key=None):
    """
    Traffic Cop function: route an uploaded file to the matching extractor.

    The extractor is chosen from the (lower-cased) extension of
    ``uploaded_file.name``.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``read()``.
        use_vision: If True, PDFs/PPTs are routed to the Vision pipeline.
        api_key: Key forwarded to the Vision pipeline; required whenever
            the Vision path is taken.

    Returns:
        The extracted text. If the Vision path is requested for a PDF/PPT
        without an API key, the string
        "[ERROR: Vision Mode requires an API Key]" is returned instead of
        raising.

    Raises:
        ValueError: If the file extension is not one of the handled types.
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # 1. Vision Path (Only for visual formats: PDF/PPT)
    if use_vision and file_ext in (".pdf", ".pptx", ".ppt"):
        if not api_key:
            return "[ERROR: Vision Mode requires an API Key]"
        return _extract_with_vision_model(uploaded_file, file_ext, api_key)

    # 2. Standard Text Path (Fast, Free)
    if file_ext == ".pdf":
        return _extract_pdf(uploaded_file)
    if file_ext in (".docx", ".doc"):
        return _extract_docx(uploaded_file)
    if file_ext in (".pptx", ".ppt"):
        return _extract_pptx(uploaded_file)
    if file_ext in (".xlsx", ".xls", ".csv"):
        return _extract_excel(uploaded_file)
    if file_ext in (".txt", ".md"):
        raw = uploaded_file.read()
        if isinstance(raw, str):
            # A text-mode handle already yields decoded text.
            return raw
        # "utf-8-sig" drops a leading byte-order mark if present, and
        # errors="replace" keeps one stray non-UTF-8 byte from aborting
        # the whole extraction with UnicodeDecodeError.
        return raw.decode("utf-8-sig", errors="replace")

    raise ValueError(f"Unsupported file type: {file_ext}")
36
+
37
+ # --- VISION EXTRACTION (The Heavy Lifter) ---
38
+
39
+ def _extract_with_vision_model(uploaded_file, file_ext, api_key):
40
+ """
41
+ Converts file pages to images and asks GPT-4o to transcribe them
42
+ into a format compatible with the OutlineProcessor.
43
+ """
44
+ client = OpenAI(api_key=api_key)
45
+ full_text = []
46
+
47
+ # 1. Convert File to Image List
48
+ images = [] # List of base64 strings
49
+
50
+ if file_ext == ".pdf":
51
+ # Load PDF from memory
52
+ doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
53
+ for page_num in range(len(doc)):
54
+ page = doc.load_page(page_num)
55
+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for clarity
56
+ img_bytes = pix.tobytes("png")
57
+ b64_img = base64.b64encode(img_bytes).decode('utf-8')
58
+ images.append(b64_img)
59
+
60
+ # (Note: PPTX vision support requires converting PPT slides to images.
61
+ # For simplicity, we fallback to standard extraction for PPTX in this prototype
62
+ # unless you install 'pdf2image' or similar heavy tools.
63
+ # For now, we'll treat PPTX as text-only or add a placeholder.)
64
+ elif file_ext in [".pptx", ".ppt"]:
65
+ return "[System Note: Direct PPT Vision requires server-side rendering tools. Using Text Mode instead.]\n" + _extract_pptx(uploaded_file)
66
+
67
+ # 2. Process Batch (One API call per page to ensure accuracy)
68
+ # We loop through images. This is slower but handles context per page better.
69
+ for i, b64_img in enumerate(images):
70
+ response = client.chat.completions.create(
71
+ model="gpt-4o",
72
+ messages=[
73
+ {
74
+ "role": "user",
75
+ "content": [
76
+ {"type": "text", "text": "Analyze this slide/page. Transcribe the content into a structured, hierarchical outline using markdown bullets (-). If there are tables, convert each row into a bullet point describing the data (e.g., '- The LM2500 has a weight of 4.7 tons'). If there are diagrams, describe the relationships labeled."},
77
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_img}"}}
78
+ ],
79
+ }
80
+ ],
81
+ max_tokens=1000
82
+ )
83
+ content = response.choices[0].message.content
84
+ full_text.append(f"--- Page {i+1} ---\n{content}")
85
+
86
+ return "\n".join(full_text)
87
+
88
+ # --- STANDARD EXTRACTORS (Existing Code) ---
89
+
90
def _extract_pdf(uploaded_file):
    """
    Extract the plain text of every page of a PDF.

    Args:
        uploaded_file: File-like object; its ``read()`` output is handed
            to PyMuPDF as an in-memory PDF stream.

    Returns:
        The text of all pages, joined by newlines.
    """
    # The context manager closes the document after reading; the original
    # left it open.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)
96
+
97
def _extract_docx(uploaded_file):
    """
    Extract text from a Word document: paragraphs first, then tables.

    Args:
        uploaded_file: Path or file-like object passed to ``docx.Document``.

    Returns:
        Newline-joined text. Non-blank paragraphs come first, followed by
        one line per non-empty table row with its cells joined by " | ".
    """
    doc = docx.Document(uploaded_file)
    full_text = [para.text for para in doc.paragraphs if para.text.strip()]

    for table in doc.tables:
        for row in table.rows:
            row_text = []
            previous_tc = None
            for cell in row.cells:
                # python-docx repeats a horizontally merged cell once per
                # grid column it spans; skip repeats of the same underlying
                # element so its text is not duplicated in the row.
                # NOTE(review): relies on the private `_tc` attribute;
                # falls back to the cell object itself if it is absent.
                current_tc = getattr(cell, "_tc", cell)
                if current_tc is previous_tc:
                    continue
                previous_tc = current_tc
                if cell.text.strip():
                    row_text.append(cell.text)
            if row_text:
                full_text.append(" | ".join(row_text))

    return "\n".join(full_text)
109
+
110
def _extract_pptx(uploaded_file):
    """
    Extract slide text and speaker notes from a PowerPoint file.

    Args:
        uploaded_file: Path or file-like object passed to ``Presentation``.

    Returns:
        Newline-joined text. For each slide, the text of every shape that
        exposes non-blank ``text`` is emitted, followed by the slide's
        notes (if any) prefixed with "[SPEAKER NOTES]: ".
    """
    prs = Presentation(uploaded_file)
    full_text = []
    for slide in prs.slides:
        full_text.extend(
            shape.text
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        )
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that as "no notes" instead of failing
            # on `.text`.
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text if notes_frame is not None else ""
            if notes.strip():
                full_text.append(f"[SPEAKER NOTES]: {notes}")
    return "\n".join(full_text)
122
+
123
+ def _extract_excel(uploaded_file):
124
+ is_csv = uploaded_file.name.lower().endswith(".csv")
125
+ if is_csv:
126
+ df = pd.read_csv(uploaded_file)
127
+ else:
128
+ df = pd.read_excel(uploaded_file)
129
+ try:
130
+ return df.to_markdown(index=False)
131
+ except:
132
+ return df.to_string(index=False)