Morinash committed on
Commit
7bb5c98
·
verified ·
1 Parent(s): 723c9b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +270 -356
app.py CHANGED
@@ -5,17 +5,32 @@ import json
5
  import pandas as pd
6
  import requests
7
  from bs4 import BeautifulSoup
8
- from pypdf import PdfReader
9
  from docx import Document
10
  from sentence_transformers import SentenceTransformer
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
  import faiss
13
  import numpy as np
14
  from transformers import pipeline
15
- import traceback
16
  import logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Set up logging
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
@@ -27,422 +42,321 @@ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
27
  INDEX_PATH = "faiss_index.index"
28
  METADATA_PATH = "metadata.json"
29
 
30
- # Global variables
31
- embed_model = None
32
- splitter = None
33
- gen_pipeline = None
34
 
35
- def initialize_models():
36
- global embed_model, splitter, gen_pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  try:
38
- embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
39
- splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
40
- gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
41
- logger.info("Models initialized successfully")
42
- return True
43
- except Exception as e:
44
- logger.error(f"Model initialization failed: {e}")
45
- return False
 
 
 
 
46
 
47
- # Initialize on startup
48
- if not initialize_models():
49
- raise RuntimeError("Failed to initialize models")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  # ==============================
52
- # FILE EXTRACTION FUNCTIONS
53
  # ==============================
54
- def extract_text_from_pdf(file_path):
55
- """Extract text from PDF using pypdf"""
56
- try:
57
- logger.info(f"Extracting PDF from: {file_path}")
58
- reader = PdfReader(file_path)
59
- text = ""
60
- page_count = len(reader.pages)
61
-
62
- # Extract from first few pages only for speed
63
- for i, page in enumerate(reader.pages[:5]):
64
- try:
65
- page_text = page.extract_text()
66
- if page_text and page_text.strip():
67
- text += f"\n--- Page {i+1}/{page_count} ---\n{page_text}\n"
68
- except Exception as e:
69
- logger.warning(f"Failed to extract page {i+1}: {e}")
70
- continue
71
-
72
- logger.info(f"PDF extraction complete: {len(text)} characters")
73
- return text.strip()
74
-
75
- except Exception as e:
76
- logger.error(f"PDF extraction error: {e}")
77
- return f"PDF extraction failed: {str(e)}"
78
-
79
  def extract_text_from_docx(file_path):
80
- """Extract text from DOCX"""
81
  try:
82
  doc = Document(file_path)
83
- paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
84
- text = "\n\n".join(paragraphs)
85
- logger.info(f"DOCX extraction: {len(paragraphs)} paragraphs")
86
- return text
87
- except Exception as e:
88
- logger.error(f"DOCX extraction error: {e}")
89
- return f"DOCX error: {str(e)}"
90
 
91
  def extract_text_from_excel(file_path):
92
- """Extract text from Excel (first sheet preview)"""
93
  try:
94
- df = pd.read_excel(file_path, sheet_name=0, nrows=50) # Limit rows
95
- text = f"Excel Sheet Preview ({df.shape[0]} rows):\n\n{df.fillna('').to_string(index=False)}"
96
- logger.info(f"Excel extraction: {df.shape}")
97
- return text
98
- except Exception as e:
99
- logger.error(f"Excel extraction error: {e}")
100
- return f"Excel error: {str(e)}"
101
 
102
  def extract_text_from_txt(file_path):
103
- """Extract text from plain text files"""
104
- encodings = ['utf-8', 'latin-1', 'cp1252']
105
- for encoding in encodings:
 
106
  try:
107
- with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
108
  return f.read()
109
  except:
110
- continue
111
- return "Could not read text file with available encodings"
112
 
113
  def extract_text_from_url(url):
114
- """Extract text from URL"""
115
  try:
116
- headers = {'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'}
117
- r = requests.get(url, timeout=15, headers=headers)
118
- r.raise_for_status()
119
-
120
  soup = BeautifulSoup(r.text, 'html.parser')
121
- for script in soup(["script", "style", "nav", "footer"]):
122
- script.decompose()
123
-
124
- text = soup.get_text(separator='\n', strip=True)
125
- # Clean up excessive whitespace
126
- lines = [line.strip() for line in text.split('\n') if line.strip()]
127
- return '\n'.join(lines[:100]) # Limit lines
128
- except Exception as e:
129
- logger.error(f"URL extraction error: {e}")
130
- return f"URL error: {str(e)}"
131
 
132
  # ==============================
133
- # MAIN INGESTION FUNCTION
134
  # ==============================
135
  def ingest_sources(files, urls):
136
- """Process files and URLs, create embeddings index"""
137
  docs = []
138
  metadata = []
139
  debug_info = []
140
 
141
- # Clear existing index for fresh ingestion
142
  for path in [INDEX_PATH, METADATA_PATH]:
143
  if os.path.exists(path):
144
  os.remove(path)
145
- debug_info.append(f"🗑️ Cleared existing {os.path.basename(path)}")
146
 
147
- # Process files
148
- processed_files = 0
149
  for f in files or []:
150
- processed_files += 1
151
- try:
152
- # Get filename
153
- name = getattr(f, 'name', f'file_{processed_files}')
154
- if not name:
155
- name = f'uploaded_file_{processed_files}'
156
-
157
- debug_info.append(f"\n📁 Processing: {os.path.basename(name)}")
158
-
159
- # Create temp file
160
- suffix = os.path.splitext(name)[1] or '.txt'
161
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir='/tmp') as tmp:
162
- # Handle different Gradio file formats
163
- file_data = None
164
  if hasattr(f, 'read'):
165
- file_data = f.read()
166
- if isinstance(file_data, str):
167
- file_data = file_data.encode('utf-8')
168
- elif isinstance(f, dict) and 'data' in f:
169
- file_data = f['data']
170
- if isinstance(file_data, str):
171
- file_data = file_data.encode('utf-8')
172
- elif isinstance(f, str):
173
- file_data = f.encode('utf-8')
174
-
175
- if not file_data:
176
- debug_info.append("❌ No file data available")
177
- continue
178
 
179
- tmp.write(file_data)
180
  tmp_path = tmp.name
181
- tmp.flush()
182
-
183
- # Extract text based on extension
184
- ext = os.path.splitext(name.lower())[1]
185
- text = ""
186
-
187
- if ext == '.pdf':
188
- text = extract_text_from_pdf(tmp_path)
189
- elif ext in ['.doc', '.docx']:
190
- text = extract_text_from_docx(tmp_path)
191
- elif ext in ['.xls', '.xlsx', '.csv']:
192
- text = extract_text_from_excel(tmp_path)
193
- else:
194
- text = extract_text_from_txt(tmp_path)
195
-
196
- # Show preview of extracted content
197
- preview = text[:200].replace('\n', ' ').strip()
198
- if len(preview) > 100:
199
- preview = preview[:100] + "..."
200
- debug_info.append(f"📄 Extracted {len(text)} chars")
201
- debug_info.append(f"🔍 Preview: '{preview}'")
202
-
203
- # Create chunks if we have substantial content
204
- if len(text.strip()) > 30 and not text.startswith(('error', 'PDF extraction failed')):
205
- chunks = splitter.split_text(text)
206
- valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
207
 
208
- for i, chunk in enumerate(valid_chunks):
209
- docs.append(chunk)
210
- metadata.append({
211
- "source": os.path.basename(name),
212
- "chunk_id": i,
213
- "type": "file",
214
- "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
215
- })
216
 
217
- debug_info.append(f"✅ Created {len(valid_chunks)} chunks")
218
- else:
219
- debug_info.append("⚠️ Skipped: insufficient content or extraction error")
220
-
221
- # Cleanup
222
- try:
223
- os.unlink(tmp_path)
224
- except:
225
- pass
226
 
227
- except Exception as e:
228
- debug_info.append(f"💥 Error processing file: {str(e)}")
229
- logger.error(f"File processing error: {e}", exc_info=True)
230
-
231
- # Process URLs
232
- if urls and urls.strip():
233
- debug_info.append(f"\n🌐 Processing URLs:")
234
- for url_line in urls.strip().split('\n'):
235
- url = url_line.strip()
236
- if url.startswith('http'):
237
- debug_info.append(f" 📡 {url}")
238
- text = extract_text_from_url(url)
239
 
240
- if len(text.strip()) > 100 and not text.startswith('URL error'):
 
241
  chunks = splitter.split_text(text)
242
- valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
243
 
244
  for i, chunk in enumerate(valid_chunks):
245
  docs.append(chunk)
246
  metadata.append({
247
- "source": url,
248
- "chunk_id": i,
249
- "type": "url",
250
- "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
251
  })
252
 
253
- debug_info.append(f" ✅ Created {len(valid_chunks)} chunks from URL")
254
  else:
255
- debug_info.append(f" ⚠️ URL skipped: insufficient content")
 
 
 
 
 
 
256
 
257
- debug_info.append(f"\n📊 SUMMARY: {len(docs)} total chunks created")
258
 
259
- if not docs:
260
- return "❌ No valid content extracted.\n\n" + "\n".join(debug_info[-15:])
 
 
 
 
 
 
 
 
 
 
 
261
 
262
- # Create FAISS index
263
- try:
264
- debug_info.append("🔄 Creating embeddings and index...")
265
- embeddings = embed_model.encode(docs, show_progress_bar=False, convert_to_numpy=True)
266
- dimension = embeddings.shape[1]
267
- index = faiss.IndexFlatL2(dimension)
268
- index.add(embeddings.astype('float32'))
269
-
270
- # Save index and metadata
271
- faiss.write_index(index, INDEX_PATH)
272
- with open(METADATA_PATH, 'w', encoding='utf-8') as f:
273
- json.dump(metadata, f, ensure_ascii=False, indent=2)
274
-
275
- debug_info.append(f"✅ Index created successfully: {embeddings.shape[0]} vectors")
276
- return f"🎉 SUCCESS! Ingested {len(docs)} chunks from {processed_files} files.\n\n" + "\n".join(debug_info[-8:])
277
-
278
- except Exception as e:
279
- debug_info.append(f"💥 Index creation failed: {str(e)}")
280
- logger.error(f"Index creation error: {e}", exc_info=True)
281
- return f"❌ Indexing failed: {str(e)}\n\n" + "\n".join(debug_info[-10:])
282
 
283
  # ==============================
284
- # RETRIEVAL
285
  # ==============================
286
- def retrieve_topk(query, k=5):
287
- """Retrieve top k relevant chunks"""
288
- try:
289
- if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
290
- return []
291
-
292
- query_embedding = embed_model.encode([query], convert_to_numpy=True)
293
- index = faiss.read_index(INDEX_PATH)
294
- distances, indices = index.search(query_embedding.astype('float32'), k)
295
-
296
- with open(METADATA_PATH, 'r', encoding='utf-8') as f:
297
- metadata = json.load(f)
298
-
299
- results = []
300
- for i, idx in enumerate(indices[0]):
301
- if idx < len(metadata):
302
- results.append({
303
- **metadata[idx],
304
- "distance": float(distances[0][i])
305
- })
306
-
307
- return results[:k]
308
- except Exception as e:
309
- logger.error(f"Retrieval error: {e}")
310
  return []
 
 
 
 
 
 
 
 
311
 
312
- # ==============================
313
- # GENERATION
314
- # ==============================
315
  def ask_prompt(query):
316
- """Generate answer based on retrieved context"""
317
- try:
318
- hits = retrieve_topk(query, k=3)
319
- if not hits:
320
- return "No relevant documents found. Please ingest some files first."
321
-
322
- # Build context from top hits
323
- context_parts = []
324
- sources = []
325
- for hit in hits:
326
- content = hit.get('content_preview', '') or ''
327
- if len(content) > 50:
328
- context_parts.append(content)
329
- source_info = f"{hit['source']} (chunk {hit['chunk_id']})"
330
- if hit.get('distance'):
331
- source_info += f" [relevance: {hit['distance']:.3f}]"
332
- sources.append(source_info)
333
-
334
- if not context_parts:
335
- return "Retrieved documents but no content available."
336
-
337
- context = "\n\n".join(context_parts)
338
- full_prompt = f"""Based on the following context, answer the question.
339
-
340
- Context:
341
- {context}
342
-
343
- Question: {query}
344
-
345
- Answer:"""
346
-
347
- # Generate response
348
- result = gen_pipeline(
349
- full_prompt,
350
- max_length=400,
351
- min_length=50,
352
- do_sample=False,
353
- temperature=0.1
354
- )[0]['generated_text']
355
-
356
- # Extract just the answer part
357
- if "Answer:" in result:
358
- answer = result.split("Answer:", 1)[1].strip()
359
- else:
360
- answer = result
361
-
362
- response = f"{answer}\n\n**Sources:**\n" + "\n".join(sources)
363
- return response
364
-
365
- except Exception as e:
366
- logger.error(f"Generation error: {e}")
367
- return f"Error generating response: {str(e)}"
368
 
369
  # ==============================
370
  # GRADIO UI
371
  # ==============================
372
- def create_ui():
373
- with gr.Blocks(title="Research Assistant", theme=gr.themes.Soft()) as demo:
374
- gr.Markdown("""
375
- # 🔍 Research Assistant
376
- Upload documents and ask questions about their content.
377
- """)
378
-
379
- with gr.Row():
380
- with gr.Column(scale=1):
381
- gr.Markdown("### 📤 Document Ingestion")
382
- file_input = gr.File(
383
- label="Upload Files",
384
- file_count="multiple",
385
- file_types=[".pdf", ".docx", ".doc", ".txt", ".xlsx", ".xls", ".csv"]
386
- )
387
- url_input = gr.Textbox(
388
- label="Or paste URLs (one per line)",
389
- placeholder="https://example.com/document\nhttps://another-site.com/page",
390
- lines=3
391
- )
392
- ingest_button = gr.Button("🚀 Ingest Documents", variant="primary", size="lg")
393
- status_output = gr.Textbox(
394
- label="Ingestion Status",
395
- lines=12,
396
- interactive=False
397
- )
398
-
399
- with gr.Column(scale=1):
400
- gr.Markdown("### ❓ Ask Questions")
401
- query_input = gr.Textbox(
402
- label="Your Question",
403
- placeholder="What does the document say about...",
404
- lines=3
405
- )
406
- ask_button = gr.Button("💬 Get Answer", variant="secondary")
407
- answer_output = gr.Textbox(
408
- label="Answer",
409
- lines=12,
410
- interactive=False
411
- )
412
-
413
- # Event handlers
414
- ingest_button.click(
415
- ingest_sources,
416
- inputs=[file_input, url_input],
417
- outputs=status_output
418
- )
419
-
420
- ask_button.click(
421
- ask_prompt,
422
- inputs=query_input,
423
- outputs=answer_output
424
- )
425
 
426
- # Examples
427
- gr.Examples(
428
- examples=[
429
- ["What is the main topic of the documents?"],
430
- ["Summarize the key points."],
431
- ["What are the dates mentioned?"],
432
- ],
433
- inputs=query_input
434
- )
435
 
436
- return demo
 
437
 
438
- # ==============================
439
- # MAIN
440
- # ==============================
441
  if __name__ == "__main__":
442
- demo = create_ui()
443
- demo.launch(
444
- server_name="0.0.0.0",
445
- server_port=7860,
446
- share=False, # Set to True for public sharing
447
- debug=True
448
- )
 
5
  import pandas as pd
6
  import requests
7
  from bs4 import BeautifulSoup
 
8
  from docx import Document
9
  from sentence_transformers import SentenceTransformer
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  import faiss
12
  import numpy as np
13
  from transformers import pipeline
 
14
  import logging
15
+ import subprocess
16
+ import shutil
17
+
18
+ # Try importing PDF libraries with fallbacks
19
+ try:
20
+ from pypdf import PdfReader
21
+ HAS_PYPDF = True
22
+ except:
23
+ HAS_PYPDF = False
24
+
25
+ try:
26
+ import pdfplumber
27
+ HAS_PDFPLUMBER = True
28
+ except:
29
+ HAS_PDFPLUMBER = False
30
+
31
+ # Fallback: use pdftotext if available (common on Linux systems)
32
+ HAS_PDFTOTEXT = shutil.which('pdftotext') is not None
33
 
 
34
  logging.basicConfig(level=logging.INFO)
35
  logger = logging.getLogger(__name__)
36
 
 
# ==============================
# CONFIG & MODEL INITIALIZATION
# ==============================
INDEX_PATH = "faiss_index.index"
METADATA_PATH = "metadata.json"

# Models are loaded eagerly at import time so the first request is fast.
# NOTE(review): EMBEDDING_MODEL_NAME and HF_GENERATION_MODEL are defined
# earlier in the file (outside this chunk) — confirm before renaming.
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
 
48
 
49
# ==============================
# ROBUST PDF EXTRACTION WITH FALLBACKS
# ==============================
def extract_text_from_pdf_robust(file_path):
    """Extract text from a PDF, trying several backends in order of reliability.

    Order: ``pdftotext`` (poppler CLI) -> ``pdfplumber`` -> ``pypdf``.

    Returns:
        tuple[str, list[str]]: ``(text, debug_info)`` where ``debug_info`` is a
        list of human-readable progress messages.  On total failure ``text``
        is an error string instead of extracted content.
    """
    methods_tried = []
    text = ""
    debug_info = []

    # Method 1: pdftotext (system command; generally the most reliable).
    if HAS_PDFTOTEXT:
        methods_tried.append("pdftotext")
        try:
            debug_info.append("Trying pdftotext...")
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if result.returncode == 0 and result.stdout.strip():
                text = result.stdout
                debug_info.append(f"✅ pdftotext success: {len(text)} chars")
                return text, debug_info
        except Exception as e:
            debug_info.append(f"pdftotext failed: {e}")

    # Method 2: pdfplumber (good for complex layouts).
    if HAS_PDFPLUMBER:
        methods_tried.append("pdfplumber")
        try:
            debug_info.append("Trying pdfplumber...")
            with pdfplumber.open(file_path) as pdf:
                for i, page in enumerate(pdf.pages[:10]):  # limit pages for speed
                    page_text = page.extract_text()
                    if page_text:
                        text += f"\n--- Page {i+1} ---\n{page_text}"
            if len(text.strip()) > 50:
                debug_info.append(f"✅ pdfplumber success: {len(text)} chars")
                return text, debug_info
        except Exception as e:
            debug_info.append(f"pdfplumber failed: {e}")

    # Method 3: pypdf with per-page error handling.
    if HAS_PYPDF:
        methods_tried.append("pypdf")
        try:
            debug_info.append("Trying pypdf...")
            reader = PdfReader(file_path)
            page_count = len(reader.pages)

            for i, page in enumerate(reader.pages[:5]):  # first 5 pages only
                try:
                    # Support both the current and legacy pypdf page APIs.
                    if hasattr(page, 'extract_text'):
                        page_text = page.extract_text()
                    elif hasattr(page, 'extractText'):
                        page_text = page.extractText()
                    else:
                        continue

                    if page_text and page_text.strip():
                        text += f"\n--- Page {i+1}/{page_count} ---\n{page_text}\n"
                except Exception as page_e:
                    debug_info.append(f"Page {i+1} failed: {page_e}")
                    continue

            if len(text.strip()) > 50:
                debug_info.append(f"✅ pypdf success: {len(text)} chars")
                return text, debug_info
            debug_info.append(f"pypdf extracted only {len(text)} chars")
        except Exception as e:
            debug_info.append(f"pypdf failed: {e}")

    # Method 4: basic sanity checks on the raw bytes.
    # NOTE(review): '/Encrypt' normally lives in the trailer, not the first
    # 1 KiB, so the password-protection check is best-effort only.
    try:
        with open(file_path, 'rb') as f:
            header = f.read(1024)
        if b'%PDF' not in header:
            return "Invalid PDF format", debug_info
        if b'/Encrypt' in header:
            return "PDF is password protected", debug_info
    except OSError:
        pass

    debug_info.append("❌ All PDF methods failed")
    # Bug fix: methods_tried was previously never populated, so this message
    # always rendered an empty list.
    return f"No text extracted. Tried: {', '.join(methods_tried)}. Likely scanned images.", debug_info
136
 
137
def extract_text_from_pdf_simple(file_path):
    """Simplified fallback PDF extractor — return ANY text it can find.

    Tries pdftotext, then pdfplumber (first 3 pages), then pypdf (first
    2 pages).  Returns an error-sentinel string when nothing works.
    """
    all_text = ""

    # Try pdftotext first (most reliable).
    if HAS_PDFTOTEXT:
        try:
            result = subprocess.run(
                ['pdftotext', file_path, '-'],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode == 0:
                all_text = result.stdout
                if len(all_text.strip()) > 20:
                    return all_text
        except Exception:
            pass  # fall through to the next backend

    # Try pdfplumber (already imported at module level when available).
    if HAS_PDFPLUMBER:
        try:
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages[:3]:
                    page_text = page.extract_text()
                    if page_text:
                        all_text += page_text + "\n"
            if len(all_text.strip()) > 20:
                return all_text
        except Exception:
            pass

    # Last resort: pypdf with minimal per-page error handling.
    if HAS_PYPDF:
        try:
            reader = PdfReader(file_path)
            for page in reader.pages[:2]:
                try:
                    page_text = page.extract_text()
                    if page_text and len(page_text.strip()) > 10:
                        all_text += page_text + "\n"
                except Exception:
                    continue
            return all_text
        except Exception:
            pass

    return "PDF extraction completely failed - likely scanned images with no text layer"
185
 
186
# ==============================
# OTHER EXTRACTION FUNCTIONS
# ==============================
def extract_text_from_docx(file_path):
    """Return all non-empty paragraphs of a .docx file joined by blank lines."""
    try:
        doc = Document(file_path)
        return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        # Bug fix: previously a bare `except` silently hid the cause.
        logging.getLogger(__name__).error(f"DOCX extraction error: {e}")
        return "DOCX extraction failed"
 
 
 
 
195
 
196
def extract_text_from_excel(file_path):
    """Render the first 30 rows of the first sheet as plain text."""
    try:
        df = pd.read_excel(file_path, nrows=30)  # cap rows to bound latency
        return df.to_string()
    except Exception as e:
        # Bug fix: previously a bare `except` silently hid the cause.
        logging.getLogger(__name__).error(f"Excel extraction error: {e}")
        return "Excel extraction failed"
 
 
 
202
 
203
def extract_text_from_txt(file_path):
    """Read a plain-text file, tolerating encoding problems.

    UTF-8 with ``errors='ignore'`` handles nearly everything; latin-1 is kept
    as a fallback, and a sentinel string is returned if the file is unreadable.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()
    except OSError:
        # Narrowed from a bare `except`: only I/O failures reach here, since
        # errors='ignore' already suppresses decode errors.
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except OSError:
            return "Text extraction failed"
 
213
 
214
def extract_text_from_url(url):
    """Fetch a URL and return up to 2000 chars of its visible text."""
    try:
        r = requests.get(url, timeout=10)
        # Bug fix: without raise_for_status(), 404/500 error pages were
        # scraped and indexed as if they were real content.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup.get_text(separator='\n', strip=True)[:2000]
    except Exception as e:
        logging.getLogger(__name__).error(f"URL extraction error: {e}")
        return "URL extraction failed"
 
 
 
 
 
 
 
221
 
222
# ==============================
# MAIN INGESTION
# ==============================
def ingest_sources(files, urls):
    """Chunk and index uploaded files into a fresh FAISS index.

    Args:
        files: iterable of file-like objects, raw bytes, or path strings
            (the shapes different Gradio versions hand over).
        urls: accepted for interface compatibility; not processed here.

    Returns:
        A human-readable status string (success summary or debug trace).
    """
    docs = []
    metadata = []
    debug_info = []

    # Start from a clean slate on every ingestion.
    for path in [INDEX_PATH, METADATA_PATH]:
        if os.path.exists(path):
            os.remove(path)

    processed = 0
    for f in files or []:
        processed += 1
        # Bug fix: for path-string inputs, use the path itself as the name so
        # the extension-based dispatch below still works (str has no .name).
        if isinstance(f, str):
            name = f
        else:
            name = getattr(f, 'name', f'file_{processed}')
        debug_info.append(f"\n📄 Processing: {os.path.basename(name)}")

        # Normalize the input into raw bytes.
        if hasattr(f, 'read'):
            data = f.read()
            if isinstance(data, str):
                data = data.encode('utf-8')
        elif isinstance(f, bytes):
            data = f
        elif isinstance(f, str) and os.path.exists(f):
            # Bug fix: newer Gradio passes a temp-file *path*; previously the
            # path string itself was written out as the file content.
            with open(f, 'rb') as src:
                data = src.read()
        else:
            data = str(f).encode()

        suffix = os.path.splitext(name)[1] or '.pdf'
        tmp_path = None
        try:
            # Bug fix: close the temp file before extraction so all bytes are
            # flushed to disk (extractors previously read a still-open,
            # possibly partially-buffered file).
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(data)
                tmp_path = tmp.name

            ext = os.path.splitext(name.lower())[1]
            if ext == '.pdf':
                text = extract_text_from_pdf_simple(tmp_path)
            elif ext == '.docx':
                text = extract_text_from_docx(tmp_path)
            elif ext in ['.xls', '.xlsx']:
                text = extract_text_from_excel(tmp_path)
            else:
                text = extract_text_from_txt(tmp_path)

            preview = text[:150].replace('\n', ' ').strip()
            if len(preview) > 100:
                preview = preview[:100] + "..."
            debug_info.append(f"Extracted {len(text)} chars")
            debug_info.append(f"Preview: '{preview}'")

            # Accept ANY substantial text.
            if len(text.strip()) > 25:
                chunks = splitter.split_text(text)
                valid_chunks = [c for c in chunks if len(c.strip()) > 15]

                for i, chunk in enumerate(valid_chunks):
                    docs.append(chunk)
                    metadata.append({
                        "source": os.path.basename(name),
                        "chunk": i,
                        "text": chunk[:900],  # limit stored text
                    })

                debug_info.append(f"✅ Created {len(valid_chunks)} chunks")
            else:
                debug_info.append("⚠️ Too little content")
        finally:
            if tmp_path:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass

    debug_info.append(f"\n📊 Total chunks: {len(docs)}")

    if docs:
        try:
            embeddings = embed_model.encode(docs)
            index = faiss.IndexFlatL2(embeddings.shape[1])
            # faiss requires float32; encode() usually returns it, but be explicit.
            index.add(np.asarray(embeddings, dtype='float32'))
            faiss.write_index(index, INDEX_PATH)

            with open(METADATA_PATH, 'w', encoding='utf-8') as mf:
                json.dump(metadata, mf, ensure_ascii=False)

            return f"✅ SUCCESS: {len(docs)} chunks indexed!"
        except Exception as e:
            return f"❌ Indexing failed: {e}"

    return "❌ No valid content.\n\n" + "\n".join(debug_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
# ==============================
# RETRIEVAL & GENERATION
# ==============================
def retrieve_topk(query, k=3):
    """Return metadata dicts for the ``k`` chunks nearest to ``query``."""
    # Bug fix: also require the metadata file, otherwise json.load below
    # raises after a partial/failed ingestion.
    if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
        return []
    q_emb = embed_model.encode([query])
    index = faiss.read_index(INDEX_PATH)
    D, I = index.search(np.asarray(q_emb, dtype='float32'), k)

    with open(METADATA_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Bug fix: faiss pads results with -1 when fewer than k vectors exist;
    # the old `i < len(metadata)` check let -1 through and silently returned
    # the *last* chunk via negative indexing.
    return [metadata[i] for i in I[0] if 0 <= i < len(metadata)]
327
 
 
 
 
328
def ask_prompt(query):
    """Answer ``query`` from retrieved chunk context via the generation pipeline."""
    hits = retrieve_topk(query)
    if not hits:
        return "No documents ingested."

    context = "\n\n".join(h['text'] for h in hits)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"

    result = gen_pipeline(prompt, max_length=300)[0]['generated_text']
    source_lines = [f"{h['source']} (chunk {h['chunk']})" for h in hits]

    return f"{result}\n\nSources:\n" + "\n".join(source_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
# ==============================
# GRADIO UI
# ==============================
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Document QA - Fixed PDF Extraction")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(file_count="multiple")
            ingest_btn = gr.Button("Ingest", variant="primary")
            status = gr.Textbox(label="Status", lines=15)

        with gr.Column():
            query_input = gr.Textbox(label="Question")
            ask_btn = gr.Button("Ask")
            answer = gr.Textbox(label="Answer", lines=10)

    # URLs are not exposed in this UI; an empty State fills the second
    # ingest_sources parameter.
    ingest_btn.click(ingest_sources, [file_input, gr.State("")], status)
    ask_btn.click(ask_prompt, query_input, answer)


if __name__ == "__main__":
    demo.launch()