heerjtdev committed on
Commit
56b6e36
·
verified ·
1 Parent(s): 4d0f7f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -316
app.py CHANGED
@@ -1,325 +1,169 @@
1
- # import gradio as gr
2
- # print("GRADIO VERSION:", gr.__version__)
3
- # import json
4
- # import os
5
- # import tempfile
6
- # from pathlib import Path
7
-
8
- # # NOTE: You must ensure that 'working_yolo_pipeline.py' exists
9
- # # and defines the following items correctly:
10
- # from working_yolo_pipeline import run_document_pipeline, DEFAULT_LAYOUTLMV3_MODEL_PATH, WEIGHTS_PATH
11
- # # Since I don't have this file, I am assuming the imports are correct.
12
-
13
- # # Define placeholders for assumed constants if the pipeline file isn't present
14
- # # You should replace these with your actual definitions if they are missing
15
- # try:
16
- # from working_yolo_pipeline import run_document_pipeline, DEFAULT_LAYOUTLMV3_MODEL_PATH, WEIGHTS_PATH
17
- # except ImportError:
18
- # print("Warning: 'working_yolo_pipeline.py' not found. Using dummy paths.")
19
- # def run_document_pipeline(*args):
20
- # return {"error": "Placeholder pipeline function called."}
21
- # DEFAULT_LAYOUTLMV3_MODEL_PATH = "./models/layoutlmv3_model"
22
- # WEIGHTS_PATH = "./weights/yolo_weights.pt"
23
-
24
-
25
- # def process_pdf(pdf_file, layoutlmv3_model_path=None):
26
- # """
27
- # Wrapper function for Gradio interface.
28
-
29
- # Args:
30
- # pdf_file: Gradio UploadButton file object
31
- # layoutlmv3_model_path: Optional custom model path
32
-
33
- # Returns:
34
- # Tuple of (JSON string, download file path)
35
- # """
36
- # if pdf_file is None:
37
- # return "❌ Error: No PDF file uploaded.", None
38
-
39
- # # Use default model path if not provided
40
- # if not layoutlmv3_model_path:
41
- # layoutlmv3_model_path = DEFAULT_LAYOUTLMV3_MODEL_PATH
42
-
43
- # # Verify model and weights exist
44
- # if not os.path.exists(layoutlmv3_model_path):
45
- # return f"❌ Error: LayoutLMv3 model not found at {layoutlmv3_model_path}", None
46
-
47
- # if not os.path.exists(WEIGHTS_PATH):
48
- # return f"❌ Error: YOLO weights not found at {WEIGHTS_PATH}", None
49
-
50
- # try:
51
- # # Get the uploaded PDF path
52
- # pdf_path = pdf_file.name
53
-
54
- # # Run the pipeline
55
- # result = run_document_pipeline(pdf_path, layoutlmv3_model_path, 'label_studio_import.json')
56
-
57
- # if result is None:
58
- # return "❌ Error: Pipeline failed to process the PDF. Check console for details.", None
59
-
60
- # # Create a temporary file for download
61
- # output_filename = f"{Path(pdf_path).stem}_analysis.json"
62
- # temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
63
-
64
- # # Dump results to the temporary file
65
- # with open(temp_output.name, 'w', encoding='utf-8') as f:
66
- # json.dump(result, f, indent=2, ensure_ascii=False)
67
-
68
- # # Format JSON for display
69
- # json_display = json.dumps(result, indent=2, ensure_ascii=False)
70
-
71
- # return json_display, temp_output.name
72
-
73
- # except Exception as e:
74
- # return f"❌ Error during processing: {str(e)}", None
75
-
76
-
77
- # # Create Gradio interface
78
- # # FIX APPLIED: Removed 'theme=gr.themes.Soft()' which caused the TypeError
79
- # with gr.Blocks(title="Document Analysis Pipeline") as demo:
80
- # gr.Markdown("""
81
- # # πŸ“„ Document Analysis Pipeline
82
-
83
- # Upload a PDF document to extract structured data including questions, options, answers, passages, and embedded images.
84
-
85
- # **Pipeline Steps:**
86
- # 1. πŸ” YOLO/OCR Preprocessing (word extraction + figure/equation detection)
87
- # 2. πŸ€– LayoutLMv3 Inference (BIO tagging)
88
- # 3. πŸ“Š Structured JSON Decoding
89
- # 4. πŸ–ΌοΈ Base64 Image Embedding
90
- # """)
91
-
92
- # with gr.Row():
93
- # with gr.Column(scale=1):
94
- # pdf_input = gr.File(
95
- # label="Upload PDF Document",
96
- # file_types=[".pdf"],
97
- # type="filepath"
98
- # )
99
-
100
- # model_path_input = gr.Textbox(
101
- # label="LayoutLMv3 Model Path (optional)",
102
- # placeholder=DEFAULT_LAYOUTLMV3_MODEL_PATH,
103
- # value=DEFAULT_LAYOUTLMV3_MODEL_PATH,
104
- # interactive=True
105
- # )
106
-
107
- # process_btn = gr.Button("πŸš€ Process Document", variant="primary", size="lg")
108
-
109
- # gr.Markdown("""
110
- # ### ℹ️ Notes:
111
- # - Processing may take several minutes depending on PDF size
112
- # - Figures and equations will be extracted and embedded as Base64
113
- # - The output JSON includes structured questions, options, and answers
114
- # """)
115
-
116
- # with gr.Column(scale=2):
117
- # json_output = gr.Code(
118
- # label="Structured JSON Output",
119
- # language="json",
120
- # lines=25
121
- # )
122
-
123
- # download_output = gr.File(
124
- # label="Download Full JSON",
125
- # interactive=False
126
- # )
127
-
128
- # # Status/Examples section
129
- # with gr.Row():
130
- # gr.Markdown("""
131
- # ### πŸ“‹ Output Format
132
- # The pipeline generates JSON with the following structure:
133
- # - **Questions**: Extracted question text
134
- # - **Options**: Multiple choice options (A, B, C, D, etc.)
135
- # - **Answers**: Correct answer(s)
136
- # - **Passages**: Associated reading passages
137
- # - **Images**: Base64-encoded figures and equations (embedded with keys like `figure1`, `equation2`)
138
- # """)
139
-
140
- # # Connect the button to the processing function
141
- # process_btn.click(
142
- # fn=process_pdf,
143
- # inputs=[pdf_input, model_path_input],
144
- # outputs=[json_output, download_output],
145
- # api_name="process_document"
146
- # )
147
-
148
- # # Example section (optional - add example PDFs if available)
149
- # # gr.Examples(
150
- # # examples=[
151
- # # ["examples/sample1.pdf"],
152
- # # ["examples/sample2.pdf"],
153
- # # ],
154
- # # inputs=pdf_input,
155
- # # )
156
-
157
- # # Launch the app
158
- # if __name__ == "__main__":
159
- # demo.launch(
160
- # server_name="0.0.0.0",
161
- # server_port=7860,
162
- # share=False,
163
- # show_error=True
164
- # )
165
-
166
-
167
-
168
-
169
-
170
  import gradio as gr
171
- print("GRADIO VERSION:", gr.__version__)
172
- import json
173
- import os
174
- import tempfile
175
- from pathlib import Path
176
-
177
- # ==============================
178
- # WRITE CUSTOM CSS FOR FONTS
179
- # ==============================
180
-
181
- # CUSTOM_CSS = """
182
- # @font-face {
183
- # font-family: 'NotoSansMath';
184
- # src: url('./NotoSansMath-Regular.ttf') format('truetype');
185
- # font-weight: normal;
186
- # font-style: normal;
187
- # }
188
-
189
- # html, body, * {
190
- # font-family: 'NotoSansMath', sans-serif !important;
191
- # }
192
- # """
193
-
194
- # # Optionally write the CSS file if needed (not required for inline css)
195
- # if not os.path.exists("custom.css"):
196
- # with open("custom.css", "w") as f:
197
- # f.write(CUSTOM_CSS)
198
- # ==============================
199
-
200
- try:
201
- from working_yolo_pipeline import run_document_pipeline, DEFAULT_LAYOUTLMV3_MODEL_PATH, WEIGHTS_PATH
202
- except ImportError:
203
- print("Warning: 'working_yolo_pipeline.py' not found. Using dummy paths.")
204
- def run_document_pipeline(*args):
205
- return {"error": "Placeholder pipeline function called."}
206
- DEFAULT_LAYOUTLMV3_MODEL_PATH = "./models/layoutlmv3_model"
207
- WEIGHTS_PATH = "./weights/yolo_weights.pt"
208
-
209
-
210
- def process_pdf(pdf_file, layoutlmv3_model_path=None):
211
- if pdf_file is None:
212
- return "❌ Error: No PDF file uploaded.", None
213
-
214
- if not layoutlmv3_model_path:
215
- layoutlmv3_model_path = DEFAULT_LAYOUTLMV3_MODEL_PATH
216
-
217
- if not os.path.exists(layoutlmv3_model_path):
218
- return f"❌ Error: LayoutLMv3 model not found at {layoutlmv3_model_path}", None
219
-
220
- if not os.path.exists(WEIGHTS_PATH):
221
- return f"❌ Error: YOLO weights not found at {WEIGHTS_PATH}", None
222
-
223
- try:
224
- pdf_path = pdf_file.name
225
-
226
- result = run_document_pipeline(pdf_path, layoutlmv3_model_path, 'label_studio_import.json')
227
-
228
- if result is None:
229
- return "❌ Error: Pipeline failed to process the PDF. Check console for details.", None
230
-
231
- output_filename = f"{Path(pdf_path).stem}_analysis.json"
232
- temp_output = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json', prefix='analysis_')
233
-
234
- with open(temp_output.name, 'w', encoding='utf-8') as f:
235
- json.dump(result, f, indent=2, ensure_ascii=False)
236
-
237
- json_display = json.dumps(result, indent=2, ensure_ascii=False)
238
-
239
- return json_display, temp_output.name
240
-
241
- except Exception as e:
242
- return f"❌ Error during processing: {str(e)}", None
243
-
244
-
245
- with gr.Blocks(
246
- title="Document Analysis Pipeline"
247
- ) as demo:
248
-
249
-
250
- gr.HTML()
251
-
252
- gr.Markdown("""
253
- # πŸ“„ Document Analysis Pipeline
254
-
255
- Upload a PDF document to extract structured data including questions, options, answers, passages, and embedded images.
256
-
257
- **Pipeline Steps:**
258
- 1. πŸ” YOLO/OCR Preprocessing (word extraction + figure/equation detection)
259
- 2. πŸ€– LayoutLMv3 Inference (BIO tagging)
260
- 3. πŸ“Š Structured JSON Decoding
261
- 4. πŸ–ΌοΈ Base64 Image Embedding
262
- """)
263
-
264
- with gr.Row():
265
- with gr.Column(scale=1):
266
- pdf_input = gr.File(
267
- label="Upload PDF Document",
268
- file_types=[".pdf"],
269
- type="filepath"
270
- )
271
-
272
- model_path_input = gr.Textbox(
273
- label="LayoutLMv3 Model Path (optional)",
274
- placeholder=DEFAULT_LAYOUTLMV3_MODEL_PATH,
275
- value=DEFAULT_LAYOUTLMV3_MODEL_PATH,
276
- interactive=True
277
  )
278
-
279
- process_btn = gr.Button("πŸš€ Process Document", variant="primary", size="lg")
280
-
281
- gr.Markdown("""
282
- ### ℹ️ Notes:
283
- - Processing may take several minutes depending on PDF size
284
- - Figures and equations will be extracted and embedded as Base64
285
- - The output JSON includes structured questions, options, and answers
286
- """)
287
-
288
- with gr.Column(scale=2):
289
- json_output = gr.Code(
290
- label="Structured JSON Output",
291
- language="json",
292
- lines=25
293
- )
294
-
295
- download_output = gr.File(
296
- label="Download Full JSON",
297
- interactive=False
298
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
  with gr.Row():
301
- gr.Markdown("""
302
- ### πŸ“‹ Output Format
303
- The pipeline generates JSON with the following structure:
304
- - **Questions**: Extracted question text
305
- - **Options**: Multiple choice options
306
- - **Answers**: Correct answer(s)
307
- - **Passages**: Associated reading passages
308
- - **Images**: Base64-encoded figures and equations
309
- """)
310
 
311
- process_btn.click(
312
- fn=process_pdf,
313
- inputs=[pdf_input, model_path_input],
314
- outputs=[json_output, download_output],
315
- api_name="process_document"
316
- )
317
 
 
 
318
 
319
  if __name__ == "__main__":
320
- demo.launch(
321
- server_name="0.0.0.0",
322
- server_port=7860,
323
- share=False,
324
- show_error=True
325
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import torch
4
+ import numpy as np
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_core.embeddings import Embeddings
8
+
9
+ # --- NEW IMPORTS FOR ONNX ---
10
+ from transformers import AutoTokenizer
11
+ from optimum.onnxruntime import ORTModelForFeatureExtraction
12
+
13
+ # ---------------------------------------------------------
14
+ # Custom ONNX Embedding Class for BGE-Large
15
+ # ---------------------------------------------------------
16
class OnnxBgeEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that runs a BGE model through ONNX Runtime.

    Texts are tokenized with the Hugging Face tokenizer for ``model_name``,
    passed through an ``ORTModelForFeatureExtraction`` model, reduced to the
    first-token (CLS) vector and L2-normalized.
    """

    # Class-level fallback so instances created without __init__ still batch.
    batch_size = 32

    def __init__(self, model_name="BAAI/bge-large-en-v1.5", file_name="model.onnx", batch_size=32):
        """Load the tokenizer and the ONNX model for ``model_name``.

        Args:
            model_name: Hub id or local path handed to both loaders.
            file_name: Accepted for backward compatibility only.
                NOTE(review): this value is not forwarded to the model loader,
                so it currently has no effect -- confirm whether it should be.
            batch_size: Maximum number of texts sent through the model in one
                inference call by ``embed_documents``. Values below 1 are
                clamped to 1.
        """
        print(f"🔄 Loading {model_name} with ONNX Runtime...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # export=True asks Optimum to convert the checkpoint to ONNX on load.
        self.model = ORTModelForFeatureExtraction.from_pretrained(
            model_name,
            export=True,
        )
        self.model_name = model_name
        self.batch_size = max(1, int(batch_size))

    def _process_batch(self, texts):
        """Tokenize ``texts``, run one inference call, return normalized vectors.

        Args:
            texts: List of strings embedded together as a single batch.

        Returns:
            A list with one list of floats per input text; ``[]`` for no input.
        """
        if not texts:
            # Nothing to embed; skip the tokenizer and model entirely.
            return []

        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,  # inputs longer than max_length tokens are cut
            max_length=512,
            return_tensors="pt",
        )

        with torch.no_grad():
            outputs = self.model(**inputs)

        # CLS pooling: keep only the first token's hidden state per sequence.
        embeddings = outputs.last_hidden_state[:, 0]

        # Unit-length vectors, so distance-based search tracks cosine similarity.
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings.numpy().tolist()

    def embed_documents(self, texts):
        """Embed documents without an instruction prefix.

        The texts are processed in slices of at most ``self.batch_size`` so
        memory use does not grow with the total number of chunks.

        Args:
            texts: Iterable of document strings.

        Returns:
            A list of embedding vectors in the same order as ``texts``.
        """
        texts = list(texts)
        step = self.batch_size
        vectors = []
        for start in range(0, len(texts), step):
            vectors.extend(self._process_batch(texts[start:start + step]))
        return vectors

    def embed_query(self, text):
        """Embed a single query, prefixed with the BGE retrieval instruction.

        Args:
            text: The query string.

        Returns:
            The embedding vector for the prefixed query as a list of floats.
        """
        instruction = "Represent this sentence for searching relevant passages: "
        return self._process_batch([instruction + text])[0]
60
+
61
+ # ---------------------------------------------------------
62
+ # Main Application Logic
63
+ # ---------------------------------------------------------
64
class VectorSystem:
    """Keeps the chunked text of one uploaded file and the FAISS index over it.

    ``all_chunks`` and ``vector_store`` always describe the same document:
    both are replaced together, and only after a new index was built.
    """

    def __init__(self):
        """Create an empty system and load the embedding model."""
        self.vector_store = None
        self.embeddings = OnnxBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5")
        self.all_chunks = []

    @staticmethod
    def _read_text(file_path):
        """Return the text of a .pdf or .txt file, or ``None`` if unsupported."""
        lowered = file_path.lower()
        if lowered.endswith('.pdf'):
            # Context manager closes the document even if a page fails to parse.
            with fitz.open(file_path) as doc:
                return "".join(page.get_text() for page in doc)
        if lowered.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        return None

    def process_file(self, file_obj):
        """Extract text from the upload, split it in order, and build the index.

        Args:
            file_obj: Either a path string or an object whose ``name``
                attribute is the path of the uploaded file; ``None`` when
                nothing was uploaded.

        Returns:
            A human-readable status message. Failures are reported in the
            message rather than raised.
        """
        if file_obj is None:
            return "No file uploaded."

        try:
            # 1. Extract text (accept both a plain path and an upload object).
            file_path = file_obj if isinstance(file_obj, str) else file_obj.name
            text = self._read_text(file_path)
            if text is None:
                return "❌ Error: Only .pdf and .txt files are supported."

            # 2. Split text into overlapping chunks, preserving document order.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=800,
                chunk_overlap=150,
                separators=["\n\n", "\n", ".", " ", ""],
            )
            chunks = text_splitter.split_text(text)

            if not chunks:
                # The previously indexed document (if any) is left untouched.
                return "Could not extract text. Is the file empty?"

            # 3. Build the index; each chunk's position is stored as its id so
            #    neighbouring chunks can be looked up at query time.
            metadatas = [{"id": i} for i in range(len(chunks))]
            vector_store = FAISS.from_texts(
                chunks,
                self.embeddings,
                metadatas=metadatas,
            )

            # Commit both pieces of state together, only after success, so the
            # ids in the index never point into a different chunk list.
            self.all_chunks = chunks
            self.vector_store = vector_store

            return f"✅ Success! Indexed {len(chunks)} chunks using BGE-Large (ONNX)."

        except Exception as e:
            return f"Error processing file: {str(e)}"

    def retrieve_evidence(self, question, student_answer):
        """Return the top matches for ``question`` with their neighbouring chunks.

        Args:
            question: Query text; empty or whitespace-only input is rejected.
            student_answer: Accepted from the UI but not used by the search.

        Returns:
            A Markdown string with up to three matches, each shown with the
            chunk before and after it, or a warning message.
        """
        if not self.vector_store:
            return "⚠️ Please upload and process a file first."

        if not question or not question.strip():
            return "⚠️ Please enter a Question."

        results = self.vector_store.similarity_search_with_score(question, k=3)

        chunks = self.all_chunks
        last_index = len(chunks) - 1
        parts = ["### 🔍 Expanded Context Analysis (Powered by BGE-Large ONNX):\n"]

        for rank, (doc, score) in enumerate(results, start=1):
            chunk_id = doc.metadata['id']

            prev_chunk = chunks[chunk_id - 1] if chunk_id > 0 else "(Start of Text)"
            next_chunk = chunks[chunk_id + 1] if chunk_id < last_index else "(End of Text)"

            # The score is a distance: lower is better.
            # NOTE(review): with the default flat L2 FAISS index this is
            # presumably the *squared* L2 distance, which for unit-length
            # vectors equals 2 * (1 - cosine similarity) -- confirm.
            parts.append(f"\n#### 🎯 Match #{rank} (Score: {score:.4f})\n")
            parts.append(f"> **Preceding Context:**\n{prev_chunk}\n\n")
            parts.append(f"> **MATCH:**\n**{doc.page_content}**\n\n")
            parts.append(f"> **Succeeding Context:**\n{next_chunk}\n")
            parts.append("---\n")

        return "".join(parts)
143
+
144
+ # Initialize System
145
# One shared VectorSystem instance backs every UI callback below; creating it
# loads the embedding model at import time.
system = VectorSystem()

# --- Gradio UI ---
# NOTE(review): the nesting below was reconstructed from a flattened diff;
# evidence_output is assumed to sit inside the right-hand column -- confirm.
with gr.Blocks(title="EduGenius Context Retriever") as demo:
    gr.Markdown("# 🎓 EduGenius: Smart Context Retriever")
    gr.Markdown("Upload a Chapter. Powered by **BGE-Large (ONNX Accelerated)** for superior accuracy.")

    with gr.Row():
        # Left column: upload a file and build the index.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="1. Upload File (PDF or TXT)",
                file_types=[".pdf", ".txt"],
            )
            upload_btn = gr.Button("Process File", variant="primary")
            upload_status = gr.Textbox(label="Status", interactive=False)

        # Right column: ask a question and read the retrieved context.
        with gr.Column(scale=2):
            question_input = gr.Textbox(
                label="2. Question",
                placeholder="e.g., What causes the chemical reaction?",
            )
            answer_input = gr.Textbox(
                label="Student Answer (Optional)",
                placeholder="e.g., The heat causes it...",
            )
            search_btn = gr.Button("Find Context + Neighbors", variant="secondary")

            evidence_output = gr.Markdown(label="Relevant Text Chunks")

    # Event wiring must happen inside the Blocks context.
    upload_btn.click(
        fn=system.process_file,
        inputs=[pdf_input],
        outputs=[upload_status],
    )
    search_btn.click(
        fn=system.retrieve_evidence,
        inputs=[question_input, answer_input],
        outputs=[evidence_output],
    )

if __name__ == "__main__":
    demo.launch()