IW2025 committed on
Commit
55a16ef
·
verified ·
1 Parent(s): 20def20

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -357
app.py CHANGED
@@ -5,98 +5,69 @@ import fitz # PyMuPDF
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import Chroma
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from langchain.chains import RetrievalQA
9
- from langchain_community.llms import HuggingFacePipeline
10
- from langchain.prompts import PromptTemplate
11
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
12
- import torch
13
- from typing import List, Dict, Any
14
- import re
15
  import base64
16
  from PIL import Image
17
  import io
18
 
19
- class CurriculumAssistant:
20
- def __init__(self):
 
 
 
 
 
 
 
21
  self.vector_db = None
22
- self.qa_chain = None
23
  self.embeddings = None
24
- self.llm = None
25
- self.curriculum_docs = []
26
- self.pdf_pages = {} # Store page-level information
27
- self.pdf_files = {} # Store PDF file objects for page rendering
28
-
29
- def load_llm(self):
30
- """Load a better model for responses"""
31
- try:
32
- # Use a more capable model for better responses
33
- model_name = "microsoft/DialoGPT-large"
34
- tokenizer = AutoTokenizer.from_pretrained(model_name)
35
- model = AutoModelForCausalLM.from_pretrained(
36
- model_name,
37
- torch_dtype=torch.float16,
38
- device_map=None, # Use CPU for Hugging Face Spaces
39
- trust_remote_code=True
40
- )
41
-
42
- pipe = pipeline(
43
- "text-generation",
44
- model=model,
45
- tokenizer=tokenizer,
46
- max_new_tokens=200, # Increased for better responses
47
- temperature=0.7,
48
- top_p=0.95,
49
- repetition_penalty=1.15,
50
- do_sample=True,
51
- pad_token_id=tokenizer.eos_token_id
52
- )
53
-
54
- self.llm = HuggingFacePipeline(pipeline=pipe)
55
- return True
56
- except Exception as e:
57
- print(f"Error loading model: {str(e)}")
58
- return False
59
-
60
- def extract_text_from_pdf_with_pages(self, pdf_path: str) -> Dict[int, str]:
61
- """Extract text from PDF file with page numbers"""
62
- try:
63
- doc = fitz.open(pdf_path)
64
  pages = {}
65
  for page_num in range(len(doc)):
66
  page = doc[page_num]
67
  text = page.get_text()
68
- if text.strip(): # Only store non-empty pages
69
  pages[page_num + 1] = text.strip()
 
70
  doc.close()
71
- return pages
72
- except Exception as e:
73
- print(f"Error extracting text from {pdf_path}: {str(e)}")
74
- return {}
75
-
76
- def get_pdf_page_image(self, pdf_path: str, page_num: int) -> str:
77
- """Get a specific page from PDF as base64 image"""
 
 
 
 
 
 
 
 
 
 
 
78
  try:
79
  doc = fitz.open(pdf_path)
80
  if page_num <= len(doc):
81
- page = doc[page_num - 1] # Convert to 0-based index
82
- # Render page as image with higher quality
83
- mat = fitz.Matrix(1.5, 1.5) # Scale for better quality
84
  pix = page.get_pixmap(matrix=mat)
85
-
86
- # Convert to PIL Image for better handling
87
  img_data = pix.tobytes("png")
88
  img = Image.open(io.BytesIO(img_data))
89
-
90
- # Convert to RGB if needed
91
  if img.mode != 'RGB':
92
  img = img.convert('RGB')
93
-
94
- # Save to bytes
95
  img_byte_arr = io.BytesIO()
96
  img.save(img_byte_arr, format='PNG')
97
  img_byte_arr = img_byte_arr.getvalue()
98
-
99
- # Convert to base64
100
  img_base64 = base64.b64encode(img_byte_arr).decode()
101
  doc.close()
102
  return f"data:image/png;base64,{img_base64}"
@@ -105,299 +76,40 @@ class CurriculumAssistant:
105
  except Exception as e:
106
  print(f"Error rendering PDF page: {str(e)}")
107
  return None
108
-
109
- def process_curriculum(self, slides_dir: str):
110
- """Process all PDF files in the slides directory"""
111
- try:
112
- slides_path = Path(slides_dir)
113
- pdf_files = list(slides_path.glob("*.pdf"))
114
-
115
- if not pdf_files:
116
- print("No PDF files found in the Slides directory!")
117
- return False
118
-
119
- all_texts = []
120
- all_chunks_with_metadata = []
121
-
122
- for pdf_file in pdf_files:
123
- print(f"Processing: {pdf_file.name}")
124
-
125
- # Store PDF file path for later page rendering
126
- self.pdf_files[pdf_file.name] = str(pdf_file)
127
-
128
- # Extract text with page information
129
- pages = self.extract_text_from_pdf_with_pages(str(pdf_file))
130
- self.pdf_pages[pdf_file.name] = pages
131
-
132
- # Combine all pages for vector database
133
- full_text = "\n\n".join([f"Page {page_num}: {text}" for page_num, text in pages.items()])
134
-
135
- if full_text:
136
- all_texts.append(full_text)
137
- self.curriculum_docs.append({
138
- 'filename': pdf_file.name,
139
- 'content': full_text[:500] + "..." if len(full_text) > 500 else full_text,
140
- 'pages': pages
141
- })
142
-
143
- if not all_texts:
144
- print("No text could be extracted from PDF files!")
145
- return False
146
-
147
- # Split text into smaller chunks with metadata
148
- text_splitter = RecursiveCharacterTextSplitter(
149
- chunk_size=500, # Reduced from 1000
150
- chunk_overlap=50, # Reduced from 200
151
- length_function=len,
152
- )
153
-
154
- for i, text in enumerate(all_texts):
155
- chunks = text_splitter.split_text(text)
156
- for j, chunk in enumerate(chunks):
157
- # Add metadata to track which document and approximate page
158
- all_chunks_with_metadata.append({
159
- 'text': chunk,
160
- 'metadata': {
161
- 'filename': pdf_files[i].name,
162
- 'chunk_id': j,
163
- 'source': 'curriculum'
164
- }
165
- })
166
-
167
- # Create embeddings
168
- self.embeddings = HuggingFaceEmbeddings(
169
- model_name="sentence-transformers/all-MiniLM-L6-v2"
170
- )
171
-
172
- # Create vector database with metadata
173
- texts = [chunk['text'] for chunk in all_chunks_with_metadata]
174
- metadatas = [chunk['metadata'] for chunk in all_chunks_with_metadata]
175
-
176
- self.vector_db = Chroma.from_texts(
177
- texts=texts,
178
- embedding=self.embeddings,
179
- metadatas=metadatas,
180
- persist_directory="./chroma_db"
181
- )
182
-
183
- print(f"Processed {len(pdf_files)} curriculum documents!")
184
- return True
185
-
186
- except Exception as e:
187
- print(f"Error processing curriculum: {str(e)}")
188
- return False
189
-
190
- def create_qa_chain(self):
191
- """Create the QA chain with custom prompts"""
192
- if not self.vector_db or not self.llm:
193
- return False
194
-
195
- # Better prompt template for more detailed responses
196
- qa_template = """You are an expert programming instructor. Based on the curriculum context provided, answer the student's question in a clear and educational manner. Write a comprehensive paragraph that explains the concept thoroughly.
197
-
198
- Context: {context}
199
-
200
- Question: {question}
201
-
202
- Answer:"""
203
-
204
- self.qa_chain = RetrievalQA.from_chain_type(
205
- llm=self.llm,
206
- chain_type="stuff",
207
- retriever=self.vector_db.as_retriever(search_kwargs={"k": 3}), # Increased for better context
208
- chain_type_kwargs={
209
- "prompt": PromptTemplate(
210
- template=qa_template,
211
- input_variables=["context", "question"]
212
- )
213
- }
214
- )
215
-
216
- return True
217
-
218
- def find_relevant_pages(self, question: str, filename: str = None) -> List[Dict]:
219
- """Find relevant pages for a given question"""
220
- try:
221
- # Search for relevant chunks
222
- results = self.vector_db.similarity_search(question, k=5) # Increased for better coverage
223
-
224
- relevant_pages = []
225
- seen_pages = set()
226
-
227
- for result in results:
228
- metadata = result.metadata
229
- doc_filename = metadata.get('filename', '')
230
-
231
- # If filename is specified, only look in that file
232
- if filename and doc_filename != filename:
233
- continue
234
-
235
- # Extract page information from chunk text
236
- chunk_text = result.page_content
237
-
238
- # Look for page numbers in the chunk
239
- page_matches = re.findall(r'Page (\d+):', chunk_text)
240
-
241
- for page_num in page_matches:
242
- page_key = f"{doc_filename}_page_{page_num}"
243
- if page_key not in seen_pages:
244
- seen_pages.add(page_key)
245
-
246
- # Get the actual page content
247
- if doc_filename in self.pdf_pages:
248
- page_content = self.pdf_pages[doc_filename].get(int(page_num), "")
249
- if page_content:
250
- relevant_pages.append({
251
- 'filename': doc_filename,
252
- 'page_number': int(page_num),
253
- 'content': page_content,
254
- 'relevance_score': len(chunk_text) # Simple relevance metric
255
- })
256
-
257
- # Sort by relevance and return top results
258
- relevant_pages.sort(key=lambda x: x['relevance_score'], reverse=True)
259
- return relevant_pages[:3] # Return top 3 most relevant pages
260
-
261
- except Exception as e:
262
- print(f"Error finding relevant pages: {str(e)}")
263
- return []
264
 
265
- def initialize_system():
266
- """Initialize the curriculum assistant system"""
267
- assistant = CurriculumAssistant()
268
-
269
- # Load LLM
270
- if not assistant.load_llm():
271
- return "❌ Failed to load language model", None, None
272
-
273
- # Process curriculum
274
- if not assistant.process_curriculum("Slides"):
275
- return "❌ Failed to process curriculum documents", None, None
276
-
277
- # Create QA chain
278
- if not assistant.create_qa_chain():
279
- return "❌ Failed to create QA chain", None, None
280
-
281
- return "✅ System initialized successfully!", assistant, assistant.curriculum_docs
282
 
283
- def ask_question(question: str, assistant: CurriculumAssistant):
284
- """Ask a question and get answer with relevant pages"""
285
- if not assistant or not assistant.qa_chain:
286
- return "Please initialize the system first.", []
287
-
288
- try:
289
- # Get answer from QA chain using invoke instead of run
290
- answer = assistant.qa_chain.invoke({"query": question})
291
-
292
- # Find relevant pages
293
- relevant_pages = assistant.find_relevant_pages(question)
294
-
295
- # Format page information and get page images
296
- page_info = ""
297
- page_images = []
298
-
299
- if relevant_pages:
300
- page_info = "📄 **Relevant Pages Found:**\n\n"
301
- for i, page in enumerate(relevant_pages, 1):
302
- page_info += f"**{i}. {page['filename']} - Page {page['page_number']}**\n"
303
- page_info += f"```\n{page['content'][:300]}...\n```\n\n"
304
-
305
- # Get page image
306
- if page['filename'] in assistant.pdf_files:
307
- page_image = assistant.get_pdf_page_image(
308
- assistant.pdf_files[page['filename']],
309
- page['page_number']
310
- )
311
- if page_image:
312
- page_images.append((page_image, f"{page['filename']} - Page {page['page_number']}"))
313
- print(f"Added page image for {page['filename']} page {page['page_number']}")
314
- else:
315
- print(f"Failed to get page image for {page['filename']} page {page['page_number']}")
316
- else:
317
- page_info = "No specific pages found for this question."
318
-
319
- # Format the complete response
320
- full_response = f"## Answer\n\n{answer['result']}\n\n---\n\n{page_info}"
321
-
322
- return full_response, page_images
323
-
324
- except Exception as e:
325
- error_msg = f"Error processing question: {str(e)}"
326
- return error_msg, []
327
 
328
- # Initialize the system
329
- status, assistant, curriculum_docs = initialize_system()
 
 
 
 
330
 
331
- # Create Gradio interface
332
- with gr.Blocks(title="Inclusive World Curriculum Assistant", theme=gr.themes.Soft()) as demo:
333
- gr.Markdown("# 🎓 Inclusive World Curriculum Assistant")
334
- gr.Markdown("An AI-powered assistant that answers questions about your curriculum and shows relevant slide pages.")
335
-
336
  with gr.Row():
337
- with gr.Column(scale=2):
338
- # Status display
339
- status_display = gr.Textbox(
340
- value=status,
341
- label="System Status",
342
- interactive=False
343
- )
344
-
345
- # Question input
346
- question_input = gr.Textbox(
347
- label="Ask a question about your curriculum",
348
- placeholder="e.g., What are if statements? How do loops work?",
349
- lines=3
350
- )
351
-
352
- # Submit button
353
- submit_btn = gr.Button("🔍 Get Answer", variant="primary")
354
-
355
- # Answer output
356
- answer_output = gr.Markdown(
357
- label="Answer with Relevant Pages",
358
- value="Ask a question to get started!"
359
- )
360
-
361
- with gr.Column(scale=1):
362
- # Curriculum overview
363
- gr.Markdown("### 📚 Curriculum Documents")
364
- if curriculum_docs:
365
- for doc in curriculum_docs:
366
- with gr.Accordion(f"📄 {doc['filename']}", open=False):
367
- gr.Markdown(f"**Preview:** {doc['content']}")
368
- else:
369
- gr.Markdown("No curriculum documents loaded.")
370
-
371
- # Page images display
372
- with gr.Row():
373
- gr.Markdown("### 📄 Relevant Slide Pages")
374
- page_images_output = gr.Gallery(
375
- label="PDF Pages",
376
- show_label=True,
377
- elem_id="gallery",
378
- columns=2,
379
- rows=2,
380
- height="auto",
381
- object_fit="contain"
382
- )
383
-
384
- # Handle question submission
385
- def process_question(question):
386
- return ask_question(question, assistant)
387
-
388
- submit_btn.click(
389
- fn=process_question,
390
- inputs=[question_input],
391
- outputs=[answer_output, page_images_output]
392
- )
393
-
394
- # Handle Enter key in question input
395
- question_input.submit(
396
- fn=process_question,
397
- inputs=[question_input],
398
- outputs=[answer_output, page_images_output]
399
- )
400
 
401
- # Launch the app
402
  if __name__ == "__main__":
403
  demo.launch()
 
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import Chroma
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
 
 
 
8
  import base64
9
  from PIL import Image
10
  import io
11
 
12
+ # --- Minimal PDF Search & Display App ---
13
+
14
+ # 1. Preprocess PDFs and build vector DB
15
+ class FastPDFSearch:
16
+ def __init__(self, slides_dir="Slides"):
17
+ self.pdf_pages = {} # {filename: {page_num: text}}
18
+ self.pdf_files = {} # {filename: path}
19
+ self.chunks = []
20
+ self.chunk_metadata = []
21
  self.vector_db = None
 
22
  self.embeddings = None
23
+ self._process_pdfs(slides_dir)
24
+ self._build_vector_db()
25
+
26
+ def _process_pdfs(self, slides_dir):
27
+ slides_path = Path(slides_dir)
28
+ pdf_files = list(slides_path.glob("*.pdf"))
29
+ for pdf_file in pdf_files:
30
+ self.pdf_files[pdf_file.name] = str(pdf_file)
31
+ doc = fitz.open(str(pdf_file))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  pages = {}
33
  for page_num in range(len(doc)):
34
  page = doc[page_num]
35
  text = page.get_text()
36
+ if text.strip():
37
  pages[page_num + 1] = text.strip()
38
+ self.pdf_pages[pdf_file.name] = pages
39
  doc.close()
40
+ # Add each page as a chunk
41
+ for page_num, text in pages.items():
42
+ self.chunks.append(text)
43
+ self.chunk_metadata.append({
44
+ "filename": pdf_file.name,
45
+ "page_number": page_num
46
+ })
47
+
48
+ def _build_vector_db(self):
49
+ self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
50
+ self.vector_db = Chroma.from_texts(
51
+ texts=self.chunks,
52
+ embedding=self.embeddings,
53
+ metadatas=self.chunk_metadata,
54
+ persist_directory="./chroma_db"
55
+ )
56
+
57
+ def get_pdf_page_image(self, pdf_path, page_num):
58
  try:
59
  doc = fitz.open(pdf_path)
60
  if page_num <= len(doc):
61
+ page = doc[page_num - 1]
62
+ mat = fitz.Matrix(1.5, 1.5)
 
63
  pix = page.get_pixmap(matrix=mat)
 
 
64
  img_data = pix.tobytes("png")
65
  img = Image.open(io.BytesIO(img_data))
 
 
66
  if img.mode != 'RGB':
67
  img = img.convert('RGB')
 
 
68
  img_byte_arr = io.BytesIO()
69
  img.save(img_byte_arr, format='PNG')
70
  img_byte_arr = img_byte_arr.getvalue()
 
 
71
  img_base64 = base64.b64encode(img_byte_arr).decode()
72
  doc.close()
73
  return f"data:image/png;base64,{img_base64}"
 
76
  except Exception as e:
77
  print(f"Error rendering PDF page: {str(e)}")
78
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
+ def search(self, query):
81
+ # Find the most relevant chunk (page)
82
+ results = self.vector_db.similarity_search(query, k=1)
83
+ if not results:
84
+ return "No relevant page found.", None, None
85
+ result = results[0]
86
+ filename = result.metadata["filename"]
87
+ page_number = result.metadata["page_number"]
88
+ text = result.page_content
89
+ img = self.get_pdf_page_image(self.pdf_files[filename], page_number)
90
+ return text, img, f"{filename} - Page {page_number}"
 
 
 
 
 
 
91
 
92
+ # --- Gradio UI ---
93
+ searcher = FastPDFSearch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ def gradio_search(query):
96
+ text, img, label = searcher.search(query)
97
+ if img:
98
+ return text, [(img, label)]
99
+ else:
100
+ return text, []
101
 
102
+ with gr.Blocks(title="Fast PDF Curriculum Search", theme=gr.themes.Soft()) as demo:
103
+ gr.Markdown("# 📄 Fast PDF Curriculum Search\nAsk a question and see the most relevant slide page!")
 
 
 
104
  with gr.Row():
105
+ with gr.Column():
106
+ question = gr.Textbox(label="Ask a question", placeholder="e.g., What are for loops?", lines=2)
107
+ submit = gr.Button("🔍 Search")
108
+ answer = gr.Markdown(label="Relevant Page Text")
109
+ with gr.Column():
110
+ gallery = gr.Gallery(label="Relevant PDF Page", columns=1, rows=1, height="auto", object_fit="contain")
111
+ submit.click(fn=gradio_search, inputs=question, outputs=[answer, gallery])
112
+ question.submit(fn=gradio_search, inputs=question, outputs=[answer, gallery])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
 
114
  if __name__ == "__main__":
115
  demo.launch()