Morinash commited on
Commit
6b8b552
·
verified ·
1 Parent(s): 18ef8c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +243 -302
app.py CHANGED
@@ -11,11 +11,9 @@ import faiss
11
  import numpy as np
12
  from transformers import pipeline
13
  import logging
14
- import subprocess
15
- import shutil
16
- import re
17
 
18
- # PDF libraries with fallbacks
19
  try:
20
  from pypdf import PdfReader
21
  HAS_PYPDF = True
@@ -28,8 +26,6 @@ try:
28
  except:
29
  HAS_PDFPLUMBER = False
30
 
31
- HAS_PDFTOTEXT = shutil.which('pdftotext') is not None
32
-
33
  logging.basicConfig(level=logging.INFO)
34
  logger = logging.getLogger(__name__)
35
 
@@ -42,362 +38,307 @@ INDEX_PATH = "faiss_index.index"
42
  METADATA_PATH = "metadata.json"
43
 
44
  # Initialize models
45
- try:
46
- embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
47
- gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
48
- logger.info("Models loaded successfully")
49
- except Exception as e:
50
- logger.error(f"Model loading failed: {e}")
51
- raise
52
 
53
  # ==============================
54
- # SIMPLE TEXT SPLITTER (NO LANGCHAIN)
55
  # ==============================
56
  def simple_text_splitter(text, chunk_size=1000, chunk_overlap=100):
57
- """Simple recursive text splitter without langchain"""
58
  if len(text) <= chunk_size:
59
  return [text.strip()]
60
 
61
  chunks = []
62
  start = 0
63
-
64
  while start < len(text):
65
- end = start + chunk_size
66
-
67
- # Try to split at sentence boundaries
68
- if end < len(text):
69
- # Look for sentence endings near chunk boundary
70
- for boundary in [end-50, end-20, end]:
71
- if boundary < len(text):
72
- # Find sentence breaks
73
- sentence_end = text.rfind('.', 0, boundary)
74
- sentence_end = max(sentence_end, text.rfind('!', 0, boundary))
75
- sentence_end = max(sentence_end, text.rfind('?', 0, boundary))
76
- sentence_end = max(sentence_end, text.rfind('\n\n', 0, boundary))
77
-
78
- if sentence_end > start + chunk_overlap:
79
- end = sentence_end + 1
80
- break
81
-
82
  chunk = text[start:end].strip()
83
- if len(chunk) > 50: # Only add substantial chunks
84
  chunks.append(chunk)
85
-
86
  start = end - chunk_overlap
87
-
88
- return chunks
89
 
90
  # ==============================
91
- # ROBUST PDF EXTRACTION
92
  # ==============================
93
- def extract_text_from_pdf_simple(file_path):
94
- """Try multiple methods to extract PDF text"""
95
- text = ""
96
 
97
- # Method 1: pdftotext (most reliable)
98
- if HAS_PDFTOTEXT:
99
- try:
100
- result = subprocess.run(
101
- ['pdftotext', '-layout', file_path, '-'],
102
- capture_output=True, text=True, timeout=20
103
- )
104
- if result.returncode == 0 and len(result.stdout.strip()) > 20:
105
- return result.stdout.strip()
106
- except Exception as e:
107
- logger.warning(f"pdftotext failed: {e}")
108
 
109
- # Method 2: pdfplumber
110
- if HAS_PDFPLUMBER:
111
- try:
112
- with pdfplumber.open(file_path) as pdf:
113
- for page in pdf.pages[:5]: # First 5 pages
114
- page_text = page.extract_text()
115
- if page_text:
116
- text += page_text + "\n\n"
117
- if len(text.strip()) > 50:
118
- return text.strip()
119
- except Exception as e:
120
- logger.warning(f"pdfplumber failed: {e}")
121
 
122
- # Method 3: pypdf with aggressive error handling
123
- if HAS_PYPDF:
124
- try:
125
- reader = PdfReader(file_path)
126
- for page in reader.pages[:3]: # First 3 pages only
127
- try:
128
- # Multiple extraction attempts
129
- page_text = None
130
- if hasattr(page, 'extract_text'):
131
- page_text = page.extract_text()
132
- elif hasattr(page, 'extractText'):
133
- page_text = page.extractText()
134
-
135
- if page_text and len(page_text.strip()) > 10:
136
- text += page_text + "\n\n"
137
- except:
138
- continue # Skip problematic pages
139
-
140
- if len(text.strip()) > 50:
141
- return text.strip()
142
- except Exception as e:
143
- logger.warning(f"pypdf failed: {e}")
144
 
145
- return f"PDF extraction failed - likely scanned images or corrupted file (size: {os.path.getsize(file_path)} bytes)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  # ==============================
148
- # OTHER FILE EXTRACTION
149
  # ==============================
150
- def extract_text_from_docx(file_path):
 
 
 
151
  try:
152
- doc = Document(file_path)
153
- return "\n\n".join([p.text.strip() for p in doc.paragraphs if p.text.strip()])
154
- except:
155
- return "DOCX extraction failed"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- def extract_text_from_excel(file_path):
 
 
 
158
  try:
159
- df = pd.read_excel(file_path, nrows=50)
160
- return df.to_string(index=False)
161
- except:
162
- return "Excel extraction failed"
163
-
164
- def extract_text_from_txt(file_path):
165
- encodings = ['utf-8', 'latin-1', 'cp1252']
166
- for enc in encodings:
167
- try:
168
- with open(file_path, 'r', encoding=enc, errors='ignore') as f:
169
- content = f.read().strip()
170
- if len(content) > 10:
171
- return content
172
- except:
173
- continue
174
- return "Text file reading failed"
 
175
 
176
- def extract_text_from_url(url):
177
  try:
178
- headers = {'User-Agent': 'Mozilla/5.0'}
179
- r = requests.get(url, headers=headers, timeout=10)
180
- soup = BeautifulSoup(r.text, 'html.parser')
181
- for tag in soup(['script', 'style']):
182
- tag.decompose()
183
- text = soup.get_text(separator='\n', strip=True)
184
- return ' '.join(text.split())[:3000]
 
185
  except:
186
- return "URL extraction failed"
187
 
188
  # ==============================
189
- # MAIN INGESTION FUNCTION
190
  # ==============================
191
  def ingest_sources(files, urls=""):
192
  docs = []
193
  metadata = []
194
  debug_info = []
195
 
196
- # Clear existing index
197
  for path in [INDEX_PATH, METADATA_PATH]:
198
  if os.path.exists(path):
199
- try:
200
- os.remove(path)
201
- debug_info.append(f"🗑️ Cleared {os.path.basename(path)}")
202
- except:
203
- pass
204
 
205
  # Process files
206
- for i, f in enumerate(files or []):
207
- try:
208
- name = getattr(f, 'name', f'file_{i+1}')
209
- debug_info.append(f"\n📄 Processing: {os.path.basename(name) if name else 'Unknown'}")
210
-
211
- # Create temp file
212
- ext = '.txt'
213
- if name:
214
- ext = os.path.splitext(name)[1] or '.txt'
215
-
216
- with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
217
- # Read file data
218
- data = None
219
- if hasattr(f, 'read'):
220
- data = f.read()
221
- if isinstance(data, str):
222
- data = data.encode('utf-8')
223
- elif isinstance(f, dict) and 'data' in f:
224
- data = f['data']
225
-
226
- if not data:
227
- debug_info.append("❌ No file data")
228
- continue
229
-
230
- tmp.write(data)
231
- tmp_path = tmp.name
232
-
233
- # Extract text
234
- file_ext = os.path.splitext(name.lower())[1] if name else ''
235
- text = ""
236
-
237
- if file_ext == '.pdf':
238
- text = extract_text_from_pdf_simple(tmp_path)
239
- elif file_ext in ['.docx', '.doc']:
240
- text = extract_text_from_docx(tmp_path)
241
- elif file_ext in ['.xlsx', '.xls', '.csv']:
242
- text = extract_text_from_excel(tmp_path)
243
- else:
244
- text = extract_text_from_txt(tmp_path)
245
-
246
- # Debug preview
247
- preview = text[:150].replace('\n', ' ').strip()
248
- if len(preview) > 100:
249
- preview = preview[:100] + "..."
250
-
251
- debug_info.append(f"📊 Extracted {len(text)} characters")
252
- debug_info.append(f"🔍 Preview: '{preview}'")
253
-
254
- # Create chunks if we have content
255
- if len(text.strip()) > 30 and not text.startswith(('PDF extraction failed', 'extraction failed')):
256
- chunks = simple_text_splitter(text)
257
- valid_chunks = [c for c in chunks if len(c.strip()) > 20]
258
-
259
- for j, chunk in enumerate(valid_chunks):
260
- docs.append(chunk)
261
- metadata.append({
262
- "source": os.path.basename(name) if name else f"file_{i+1}",
263
- "chunk": j,
264
- "text": chunk
265
- })
266
-
267
- debug_info.append(f"�� Created {len(valid_chunks)} chunks")
268
- else:
269
- debug_info.append("⚠️ Skipped: insufficient content or extraction error")
270
-
271
- # Cleanup
272
- try:
273
- os.unlink(tmp_path)
274
- except:
275
- pass
276
-
277
- except Exception as e:
278
- debug_info.append(f"💥 Error: {str(e)}")
279
- logger.error(f"File processing error: {e}")
280
-
281
- # Process URLs
282
- for url in urls.strip().split('\n'):
283
- url = url.strip()
284
- if url.startswith('http'):
285
- text = extract_text_from_url(url)
286
- if len(text.strip()) > 100:
287
- chunks = simple_text_splitter(text)
288
- for j, chunk in enumerate(chunks):
289
- if len(chunk.strip()) > 20:
290
- docs.append(chunk)
291
- metadata.append({
292
- "source": url,
293
- "chunk": j,
294
- "text": chunk,
295
- "type": "url"
296
- })
297
-
298
- debug_info.append(f"\n📈 TOTAL: {len(docs)} chunks created")
299
-
300
- if not docs:
301
- return "❌ No valid content extracted.\n\n" + "\n".join(debug_info[-12:])
302
-
303
- # Create FAISS index
304
- try:
305
- debug_info.append("🔄 Creating embeddings...")
306
- embeddings = embed_model.encode(docs, show_progress_bar=False)
307
- dimension = embeddings.shape[1]
308
- index = faiss.IndexFlatL2(dimension)
309
- index.add(embeddings.astype('float32'))
310
 
311
- faiss.write_index(index, INDEX_PATH)
312
- with open(METADATA_PATH, 'w', encoding='utf-8') as f:
313
- json.dump(metadata, f, ensure_ascii=False, indent=2)
 
 
314
 
315
- debug_info.append("✅ Index created successfully!")
316
- return f"🎉 SUCCESS! Ingested {len(docs)} chunks.\n\n" + "\n".join(debug_info[-6:])
 
317
 
318
- except Exception as e:
319
- debug_info.append(f"💥 Indexing failed: {str(e)}")
320
- return f"❌ Indexing error: {str(e)}\n\n" + "\n".join(debug_info[-6:])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  # ==============================
323
  # RETRIEVAL & GENERATION
324
  # ==============================
325
  def retrieve_topk(query, k=3):
326
- try:
327
- if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
328
- return []
329
-
330
- q_emb = embed_model.encode([query], show_progress_bar=False)
331
- index = faiss.read_index(INDEX_PATH)
332
- distances, indices = index.search(q_emb.astype('float32'), k)
333
-
334
- with open(METADATA_PATH, 'r', encoding='utf-8') as f:
335
- metadata = json.load(f)
336
-
337
- return [metadata[i] for i in indices[0] if i < len(metadata)]
338
- except:
339
  return []
 
 
 
 
 
 
340
 
341
  def ask_prompt(query):
342
  hits = retrieve_topk(query)
343
  if not hits:
344
- return "No documents ingested or no relevant matches found."
345
-
346
- context = "\n\n".join([h.get('text', '')[:800] for h in hits])
 
347
  sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
348
-
349
- full_prompt = f"""Context:
350
- {context}
351
-
352
- Question: {query}
353
-
354
- Answer:"""
355
-
356
- try:
357
- result = gen_pipeline(full_prompt, max_length=300, do_sample=False)[0]['generated_text']
358
- answer = result.split('Answer:')[-1].strip() if 'Answer:' in result else result
359
- return f"{answer}\n\n**Sources:**\n" + "\n".join(sources)
360
- except Exception as e:
361
- return f"Generation error: {str(e)}"
362
 
363
  # ==============================
364
- # GRADIO UI
365
  # ==============================
366
- with gr.Blocks(title="Document QA", theme=gr.themes.Soft()) as demo:
367
- gr.Markdown("# 🔍 Document Q&A Assistant")
368
- gr.Markdown("Upload PDFs, DOCX, Excel files or URLs to create a searchable knowledge base.")
369
-
370
  with gr.Row():
371
- with gr.Column(scale=1):
372
- file_input = gr.File(file_count="multiple", label="Upload Files")
373
- url_input = gr.Textbox(label="URLs (one per line)", lines=2)
374
- ingest_btn = gr.Button("🚀 Ingest Documents", variant="primary")
375
- status_output = gr.Textbox(label="Ingestion Status", lines=12)
376
-
377
- with gr.Column(scale=1):
378
- query_input = gr.Textbox(label="Ask a question about your documents", lines=3)
379
- ask_btn = gr.Button("💬 Get Answer", variant="secondary")
380
- answer_output = gr.Textbox(label="Answer", lines=10)
381
-
382
- # Events
383
- ingest_btn.click(
384
- ingest_sources,
385
- inputs=[file_input, url_input],
386
- outputs=status_output
387
- )
388
- ask_btn.click(
389
- ask_prompt,
390
- inputs=query_input,
391
- outputs=answer_output
392
- )
393
 
394
- gr.Markdown("### Tips:")
395
- gr.Markdown("""
396
- - **PDFs**: Works best with searchable PDFs (not scanned images)
397
- - **Scanned PDFs**: Convert to searchable text first using Adobe Acrobat or online OCR
398
- - **Large files**: Processing may take 1-2 minutes
399
- - **Test first**: Try with a simple text file to verify setup
400
- """)
401
 
402
  if __name__ == "__main__":
403
  demo.launch()
 
11
  import numpy as np
12
  from transformers import pipeline
13
  import logging
14
+ import io
 
 
15
 
16
+ # PDF libraries
17
  try:
18
  from pypdf import PdfReader
19
  HAS_PYPDF = True
 
26
  except:
27
  HAS_PDFPLUMBER = False
28
 
 
 
29
  logging.basicConfig(level=logging.INFO)
30
  logger = logging.getLogger(__name__)
31
 
 
38
  METADATA_PATH = "metadata.json"
39
 
40
  # Initialize models
41
+ embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
42
+ gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
 
 
 
 
 
43
 
44
  # ==============================
45
+ # SIMPLE TEXT SPLITTER
46
  # ==============================
47
def simple_text_splitter(text, chunk_size=1000, chunk_overlap=100):
    """Split *text* into overlapping chunks of at most *chunk_size* characters.

    Consecutive chunks share *chunk_overlap* trailing/leading characters.
    Chunks of 20 characters or fewer are dropped from the result.

    Returns a list of stripped chunk strings (possibly a single element when
    the whole text fits in one chunk).
    """
    if len(text) <= chunk_size:
        return [text.strip()]

    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end].strip()
        if len(chunk) > 50:  # skip fragments that are too small to embed usefully
            chunks.append(chunk)
        # BUG FIX: once `end` is clamped to len(text), `end - chunk_overlap`
        # stays below len(text) forever, so the original loop never terminated.
        # Stop as soon as the final chunk has been taken.
        if end >= len(text):
            break
        start = end - chunk_overlap
    return [c for c in chunks if len(c) > 20]
 
60
 
61
  # ==============================
62
+ # CORRECTED FILE HANDLING FOR GRADIO
63
  # ==============================
64
def get_file_data(file_obj):
    """Normalise the many shapes Gradio hands us for an uploaded file.

    Returns a ``(payload, tag)`` pair where *tag* names how to interpret
    *payload*: "path", "bytes", "read", "dict", "dict_path" or "string_path".
    On failure the second element is instead the list of debug messages
    collected along the way, with the payload set to ``None``.
    """
    trail = []

    # Shape 1: object exposing a temp-file path via .name
    path_attr = getattr(file_obj, 'name', None)
    if path_attr:
        trail.append(f"Using file path: {path_attr}")
        return path_attr, "path"

    # Shape 2: object exposing raw content via .data
    data_attr = getattr(file_obj, 'data', None)
    if data_attr:
        trail.append(f"Using file.data: {len(data_attr)} bytes")
        return data_attr, "bytes"

    # Shape 3: readable file-like object
    try:
        if hasattr(file_obj, 'read'):
            file_obj.seek(0)  # rewind in case it was already consumed
            payload = file_obj.read()
            if payload:
                trail.append(f"Read {len(payload)} bytes from file object")
                return payload, "read"
    except Exception as exc:
        trail.append(f"Read failed: {exc}")

    # Shape 4: plain dict carrying content or a path
    if isinstance(file_obj, dict):
        if file_obj.get('data'):
            trail.append(f"Using dict data: {len(file_obj['data'])} bytes")
            return file_obj['data'], "dict"
        if file_obj.get('name'):
            trail.append(f"Using dict path: {file_obj['name']}")
            return file_obj['name'], "dict_path"

    # Shape 5: bare string that is an existing filesystem path
    if isinstance(file_obj, str) and os.path.exists(file_obj):
        trail.append(f"Using string path: {file_obj}")
        return file_obj, "string_path"

    trail.append("❌ No valid file data found")
    return None, trail
105
 
106
  # ==============================
107
+ # PDF EXTRACTION
108
  # ==============================
109
def extract_pdf_text(file_data, source_type, debug_info):
    """Extract text from a PDF via a cascade of fallbacks.

    file_data   -- a filesystem path or raw content, as classified by
                   get_file_data().
    source_type -- tag describing what file_data is ("path", "bytes", ...).
    debug_info  -- list of log strings; appended to in place.

    Returns extracted text on success, otherwise a human-readable
    failure message string.
    """
    temp_path = None

    try:
        # If we have a file path, use it directly
        if source_type in ["path", "string_path", "dict_path"]:
            file_path = file_data
            if not os.path.exists(file_path):
                debug_info.append(f"❌ File path doesn't exist: {file_path}")
                return "File not found"

            # Try pdftotext first (if available); silently skipped when the
            # poppler binary is missing, times out, or yields too little text
            try:
                import subprocess
                result = subprocess.run(['pdftotext', file_path, '-'],
                                        capture_output=True, text=True, timeout=15)
                if result.returncode == 0 and len(result.stdout.strip()) > 30:
                    debug_info.append(f"✅ pdftotext: {len(result.stdout)} chars")
                    return result.stdout
            except:
                pass

        # Create temp file from bytes so the library parsers can open it by path
        if source_type in ["bytes", "read", "dict"]:
            temp_path = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name
            with open(temp_path, 'wb') as f:
                if isinstance(file_data, str):
                    # NOTE(review): latin1 round-trips byte values 0-255; this
                    # assumes the string originated from a latin1 decode — confirm
                    f.write(file_data.encode('latin1'))  # PDFs are binary
                else:
                    f.write(file_data)
            file_path = temp_path
            debug_info.append(f"Created temp file: {temp_path}")

        # Try pdfplumber (first 5 pages only)
        if HAS_PDFPLUMBER:
            try:
                with pdfplumber.open(file_path) as pdf:
                    text = ""
                    for i, page in enumerate(pdf.pages[:5]):
                        page_text = page.extract_text()
                        if page_text:
                            text += page_text + "\n"
                    if len(text.strip()) > 50:
                        debug_info.append(f"✅ pdfplumber: {len(text)} chars")
                        return text
            except Exception as e:
                debug_info.append(f"pdfplumber failed: {e}")

        # Try pypdf (first 3 pages only; pages that raise are skipped)
        if HAS_PYPDF:
            try:
                reader = PdfReader(file_path)
                text = ""
                for i, page in enumerate(reader.pages[:3]):
                    try:
                        page_text = page.extract_text()
                        if page_text and page_text.strip():
                            text += page_text + "\n"
                    except:
                        continue
                if len(text.strip()) > 30:
                    debug_info.append(f"✅ pypdf: {len(text)} chars")
                    return text
            except Exception as e:
                debug_info.append(f"pypdf failed: {e}")

        return "No text extracted - likely scanned PDF images"

    finally:
        # Always remove the temp copy we may have created above
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except:
                pass
184
 
185
+ # ==============================
186
+ # OTHER EXTRACTIONS
187
+ # ==============================
188
def extract_docx_text(file_data, source_type, debug_info):
    """Extract paragraph text from a .docx document.

    Returns the non-empty paragraphs joined by blank lines, "No text in DOCX"
    when the document is (nearly) empty, or "DOCX error: ..." on any failure.
    """
    tmp_path = None
    try:
        # BUG FIX: the original only recognised "path", so "string_path" /
        # "dict_path" sources fell into the bytes branch, produced an empty
        # temp file, and always failed.
        if source_type in ("path", "string_path", "dict_path"):
            doc = Document(file_data)
        else:
            # Materialise the bytes so python-docx can open them from disk
            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp:
                payload = file_data if isinstance(file_data, bytes) else str(file_data).encode('utf-8')
                tmp.write(payload)
                tmp_path = tmp.name
            doc = Document(tmp_path)

        text = "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
        if len(text) > 20:
            return text
        return "No text in DOCX"
    except Exception as e:
        return f"DOCX error: {e}"
    finally:
        # BUG FIX: original leaked the temp file when Document() raised
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
207
 
208
def extract_text_file(file_data, source_type, debug_info):
    """Read plain-text content from a path or a raw bytes/str payload.

    Returns the decoded text, or "Text extraction failed" on error.
    """
    try:
        # BUG FIX: the original only opened "path" sources; "string_path" /
        # "dict_path" fell through and returned the *path itself* as content.
        if source_type in ("path", "string_path", "dict_path"):
            with open(file_data, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        if isinstance(file_data, bytes):
            return file_data.decode('utf-8', errors='ignore')
        return str(file_data)
    except Exception:
        return "Text extraction failed"
220
 
221
  # ==============================
222
+ # MAIN INGESTION
223
  # ==============================
224
def ingest_sources(files, urls=""):
    """Ingest uploaded files into a fresh FAISS index + metadata store.

    files -- list of Gradio file objects (any shape get_file_data handles).
    urls  -- accepted for interface compatibility; URL ingestion is not
             implemented in this version of the app.

    Returns a human-readable status string for the UI.
    """
    docs = []
    metadata = []
    debug_info = []

    # Clear any existing index/metadata so every ingest starts from scratch
    for path in [INDEX_PATH, METADATA_PATH]:
        if os.path.exists(path):
            os.remove(path)

    # Process files
    for i, file_obj in enumerate(files or []):
        debug_info.append(f"\n📄 Processing file {i+1}")

        # Resolve the file object into (payload, type-tag)
        file_data, source_info = get_file_data(file_obj)
        if isinstance(source_info, list):
            # get_file_data failed and returned its debug trail instead of a tag
            debug_info.extend(source_info)
            continue

        if not file_data:
            # BUG FIX: the original message had lost its ❌ marker
            debug_info.append("❌ No file data")
            continue

        # Get filename and extension
        filename = getattr(file_obj, 'name', f'file_{i+1}')
        if isinstance(filename, bytes):
            filename = filename.decode('utf-8', errors='ignore')
        ext = os.path.splitext(filename.lower())[1] if filename else ''

        # BUG FIX: the original logged the literal text "(unknown)" here
        # instead of the actual filename
        debug_info.append(f"File: {os.path.basename(filename)}, Type: {source_info}")

        # Extract text according to extension
        text = ""
        if ext == '.pdf':
            text = extract_pdf_text(file_data, source_info, debug_info)
        elif ext in ['.docx', '.doc']:
            text = extract_docx_text(file_data, source_info, debug_info)
        elif ext in ['.txt', '.md']:
            text = extract_text_file(file_data, source_info, debug_info)
        else:
            debug_info.append(f"Unknown extension: {ext}")
            continue

        # Short preview for the status log
        preview = text[:100].replace('\n', ' ').strip()
        if len(preview) > 80:
            preview = preview[:80] + "..."
        debug_info.append(f"Extracted {len(text)} chars")
        debug_info.append(f"Preview: '{preview}'")

        # Create chunks and record per-chunk metadata
        if len(text.strip()) > 30:
            chunks = simple_text_splitter(text)
            for j, chunk in enumerate(chunks):
                docs.append(chunk)
                metadata.append({
                    "source": filename,
                    "chunk": j,
                    "text": chunk
                })
            debug_info.append(f"✅ {len(chunks)} chunks created")
        else:
            debug_info.append("⚠️ Insufficient content")

    debug_info.append(f"\n📊 Total: {len(docs)} chunks")

    if docs:
        embeddings = embed_model.encode(docs)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        # FAISS expects float32; encode() normally returns it, but be explicit
        index.add(embeddings.astype('float32'))
        faiss.write_index(index, INDEX_PATH)
        with open(METADATA_PATH, 'w', encoding='utf-8') as f:
            json.dump(metadata, f)
        return f"✅ SUCCESS: {len(docs)} chunks!"

    return "❌ No content.\n\n" + "\n".join(debug_info[-15:])
301
 
302
  # ==============================
303
  # RETRIEVAL & GENERATION
304
  # ==============================
305
def retrieve_topk(query, k=3):
    """Return metadata dicts for the *k* chunks nearest to *query*.

    Returns [] when no index has been built yet.
    """
    # BUG FIX: also require the metadata file (the original only checked the
    # index, and crashed on the open() below if metadata was missing)
    if not (os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH)):
        return []
    q_emb = embed_model.encode([query])
    index = faiss.read_index(INDEX_PATH)
    _, indices = index.search(q_emb.astype('float32'), k)
    with open(METADATA_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    # BUG FIX: FAISS pads results with -1 when fewer than k vectors exist;
    # `i < len(metadata)` let -1 through and silently returned the *last*
    # metadata entry. Exclude negative indices explicitly.
    return [metadata[i] for i in indices[0] if 0 <= i < len(metadata)]
314
 
315
def ask_prompt(query):
    """Answer *query* using the ingested documents.

    Retrieves the top chunks, builds a stuffed prompt, and runs the
    text2text pipeline. Returns the answer followed by its sources, or an
    explanatory message when nothing is indexed / generation fails.
    """
    hits = retrieve_topk(query)
    if not hits:
        return "No documents found."
    # Cap each chunk at 600 chars so the prompt stays within model limits
    context = "\n\n".join([h.get('text', '')[:600] for h in hits])
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
    try:
        result = gen_pipeline(prompt, max_length=300)[0]['generated_text']
    except Exception as e:
        # BUG FIX: generation errors previously propagated to the caller
        # instead of being reported in the answer box
        return f"Generation error: {e}"
    return f"{result}\n\nSources:\n" + "\n".join(sources)
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  # ==============================
326
+ # UI
327
  # ==============================
328
# Build the Gradio interface: left column ingests files, right column asks
# questions against the resulting index.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Document QA")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(file_count="multiple")  # multiple uploads allowed
            ingest_btn = gr.Button("Ingest", variant="primary")
            status = gr.Textbox(lines=15)  # ingestion log / status output
        with gr.Column():
            query_input = gr.Textbox(label="Question")
            ask_btn = gr.Button("Ask")
            answer = gr.Textbox(lines=10)  # generated answer + sources

    # Wire events; gr.State("") supplies ingest_sources' `urls` argument
    ingest_btn.click(ingest_sources, [file_input, gr.State("")], status)
    ask_btn.click(ask_prompt, query_input, answer)
 
 
 
 
 
342
 
343
# Start the app only when executed directly (not on import)
if __name__ == "__main__":
    demo.launch()