Morinash committed on
Commit
d6c6abe
Β·
verified Β·
1 Parent(s): 179339d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -141
app.py CHANGED
@@ -14,6 +14,19 @@ import numpy as np
14
  from transformers import pipeline
15
  import traceback
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # ==============================
18
  # CONFIG
19
  # ==============================
@@ -22,258 +35,264 @@ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
22
  INDEX_PATH = "faiss_index.index"
23
  METADATA_PATH = "metadata.json"
24
 
25
- # Load embedding model
26
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
27
 
28
  # ==============================
29
- # Enhanced text extractors with debugging
30
  # ==============================
31
- def extract_text_from_pdf(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  try:
33
- print(f"Processing PDF: {file_path}")
34
  reader = PdfReader(file_path)
35
- pages = []
36
  for i, page in enumerate(reader.pages):
37
- text = page.extract_text()
38
- if text and text.strip():
39
- pages.append(f"Page {i+1}:\n{text}")
40
- else:
41
- print(f"Warning: Page {i+1} has no extractable text")
42
- result = "\n\n".join(pages)
43
- print(f"PDF extracted {len(pages)} pages, {len(result)} chars")
44
- return result if result.strip() else "No text found in PDF (possibly scanned image)"
45
  except Exception as e:
46
- print(f"PDF error: {str(e)}")
47
- return f"PDF error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
49
  def extract_text_from_docx(file_path):
50
  try:
51
- print(f"Processing DOCX: {file_path}")
52
  doc = Document(file_path)
53
- paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
54
- result = "\n\n".join(paragraphs)
55
- print(f"DOCX extracted {len(paragraphs)} paragraphs, {len(result)} chars")
56
- return result if result.strip() else "No text found in DOCX"
57
  except Exception as e:
58
- print(f"DOCX error: {str(e)}")
59
  return f"DOCX error: {str(e)}"
60
 
61
  def extract_text_from_excel(file_path):
62
  try:
63
- print(f"Processing Excel: {file_path}")
64
- # Try first sheet only for speed
65
- df = pd.read_excel(file_path, sheet_name=0)
66
- result = f"Sheet: {df.shape}\n{df.fillna('').to_string()}"
67
- print(f"Excel extracted {df.shape[0]} rows, {len(result)} chars")
68
- return result
69
  except Exception as e:
70
- print(f"Excel error: {str(e)}")
71
  return f"Excel error: {str(e)}"
72
 
73
  def extract_text_from_txt(file_path):
74
  try:
75
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
76
  return f.read()
77
- except Exception as e:
78
- return f"TXT error: {str(e)}"
 
79
 
80
  def extract_text_from_url(url):
81
  try:
82
  r = requests.get(url, timeout=10)
83
- soup = BeautifulSoup(r.text, "html.parser") # Use html.parser as fallback
84
  for s in soup(["script", "style"]):
85
  s.decompose()
86
  text = soup.get_text(separator="\n", strip=True)
87
- return text[:5000] # Limit length
88
  except Exception as e:
89
  return f"URL error: {str(e)}"
90
 
91
  # ==============================
92
- # Text chunking
93
  # ==============================
94
  splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
95
 
96
  # ==============================
97
- # Enhanced ingestion with debug output
98
  # ==============================
99
  def ingest_sources(files, urls):
100
  docs = []
101
  metadata = []
102
  debug_info = []
103
-
104
- # Clear existing index for fresh start during testing
105
- if os.path.exists(INDEX_PATH):
106
- os.remove(INDEX_PATH)
107
- debug_info.append("Cleared existing index")
108
- if os.path.exists(METADATA_PATH):
109
- os.remove(METADATA_PATH)
110
- debug_info.append("Cleared existing metadata")
111
-
112
- # Process files
113
- processed_files = 0
114
  for f in files or []:
115
- processed_files += 1
116
- name = getattr(f, "name", f"file_{processed_files}")
117
- debug_info.append(f"Processing: {name}")
118
 
119
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
120
  try:
121
- # Handle different file types from Gradio
122
  data = None
123
  if hasattr(f, 'read'):
124
  data = f.read()
 
125
  elif isinstance(f, str):
126
  data = f.encode('utf-8')
127
  elif isinstance(f, dict) and 'data' in f:
128
  data = f['data']
129
- if isinstance(data, str):
130
- data = data.encode('utf-8')
131
 
132
- if data is None:
133
- debug_info.append(f"Failed: Could not read {name}")
134
  continue
135
 
136
  tmp.write(data)
137
  tmp.flush()
138
 
139
- # Extract based on extension
140
  ext = os.path.splitext(name.lower())[1]
141
  if ext == '.pdf':
142
- text = extract_text_from_pdf(tmp.name)
143
  elif ext == '.docx':
144
  text = extract_text_from_docx(tmp.name)
145
  elif ext in ['.xls', '.xlsx']:
146
  text = extract_text_from_excel(tmp.name)
147
- elif ext in ['.txt', '.md']:
148
- text = extract_text_from_txt(tmp.name)
149
  else:
150
  text = extract_text_from_txt(tmp.name)
151
-
152
- debug_info.append(f"Extracted {len(text)} chars from {name}")
153
 
154
- if len(text) > 50 and "error" not in text.lower():
 
 
 
155
  chunks = splitter.split_text(text)
156
- for i, chunk in enumerate(chunks):
157
- if len(chunk.strip()) > 10:
158
- docs.append(chunk)
159
- metadata.append({
160
- "source": name,
161
- "chunk": i,
162
- "type": "file",
163
- "text": chunk
164
- })
165
- debug_info.append(f"Created {len(chunks)} chunks from {name}")
166
  else:
167
- debug_info.append(f"Skipped {name}: too short or error")
168
 
169
  except Exception as e:
170
- debug_info.append(f"Error processing {name}: {str(e)}")
171
  finally:
172
- try:
173
- os.unlink(tmp.name)
174
- except:
175
- pass
176
-
177
- # Process URLs
178
- processed_urls = 0
179
  for url in (urls or "").splitlines():
180
- url = url.strip()
181
- if url:
182
- processed_urls += 1
183
- debug_info.append(f"Fetching URL: {url}")
184
- text = extract_text_from_url(url)
185
  if len(text) > 100 and "error" not in text.lower():
186
  chunks = splitter.split_text(text)
187
- for i, chunk in enumerate(chunks):
188
- if len(chunk.strip()) > 10:
189
- docs.append(chunk)
190
- metadata.append({
191
- "source": url,
192
- "chunk": i,
193
- "type": "url",
194
- "text": chunk
195
- })
196
- debug_info.append(f"Created {len(chunks)} chunks from {url}")
197
- else:
198
- debug_info.append(f"Skipped URL {url}: insufficient content")
199
-
200
- debug_info.append(f"Total chunks created: {len(docs)}")
201
 
202
  if not docs:
203
- return "❌ No valid text extracted.\n\nDebug info:\n" + "\n".join(debug_info[:10])
204
 
205
- # Create embeddings and index
206
  try:
207
- print(f"Creating embeddings for {len(docs)} chunks...")
208
- embeddings = embed_model.encode(docs, show_progress_bar=False, convert_to_numpy=True)
209
- dim = embeddings.shape[1]
210
- index = faiss.IndexFlatL2(dim)
211
  index.add(embeddings)
212
-
213
  faiss.write_index(index, INDEX_PATH)
214
  with open(METADATA_PATH, "w", encoding="utf-8") as f:
215
- json.dump(metadata, f, ensure_ascii=False, indent=2)
216
-
217
- return f"βœ… Success! Ingested {len(docs)} chunks.\n\nDebug: {len(files or [])} files, {processed_urls} URLs processed."
218
  except Exception as e:
219
- return f"❌ Indexing failed: {str(e)}\n\nDebug info:\n" + "\n".join(debug_info)
220
 
221
  # ==============================
222
- # Retrieval
223
  # ==============================
224
- def retrieve_topk(query, k=5):
225
- if not os.path.exists(INDEX_PATH):
226
- return []
227
  q_emb = embed_model.encode([query])
228
  index = faiss.read_index(INDEX_PATH)
229
  D, I = index.search(q_emb, k)
230
-
231
- with open(METADATA_PATH, "r", encoding="utf-8") as f:
232
  metadata = json.load(f)
233
-
234
  return [metadata[idx] for idx in I[0] if idx < len(metadata)]
235
 
236
- # ==============================
237
- # Generation
238
- # ==============================
239
  gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
240
 
241
  def ask_prompt(prompt):
242
- hits = retrieve_topk(prompt, k=3)
243
- if not hits:
244
- return "No documents ingested or no relevant matches found."
245
 
246
- context = "\n\n".join([h.get("text", "")[:1000] for h in hits]) # Limit context length
247
  sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
248
 
249
- full_prompt = f"Context:\n{context}\n\nQuestion: {prompt}\n\nAnswer:"
250
-
251
- try:
252
- result = gen_pipeline(full_prompt, max_length=300, do_sample=False)[0]["generated_text"]
253
- return f"{result}\n\n**Sources:**\n" + "\n".join(sources)
254
- except Exception as e:
255
- return f"Generation error: {str(e)}"
256
 
257
  # ==============================
258
- # Gradio UI
259
  # ==============================
260
  with gr.Blocks() as demo:
261
- gr.Markdown("# πŸ” Research Assistant\nUpload files and click **Ingest** to see debug info.")
262
-
263
  with gr.Row():
264
  with gr.Column():
265
- file_in = gr.File(label="Upload files", file_count="multiple")
266
- urls_in = gr.Textbox(label="URLs (one per line)", placeholder="https://example.com")
267
  ingest_btn = gr.Button("Ingest", variant="primary")
268
- ingest_output = gr.Textbox(label="Status & Debug Info", lines=8)
269
-
270
  with gr.Column():
271
- prompt_in = gr.Textbox(label="Ask a question", lines=3)
272
  ask_btn = gr.Button("Ask")
273
- answer_out = gr.Textbox(label="Answer", lines=10)
274
 
275
- ingest_btn.click(ingest_sources, inputs=[file_in, urls_in], outputs=ingest_output)
276
- ask_btn.click(ask_prompt, inputs=prompt_in, outputs=answer_out)
277
 
278
  if __name__ == "__main__":
279
  demo.launch()
 
14
  from transformers import pipeline
15
  import traceback
16
 
17
# Try multiple PDF libraries.
# Optional backends: pdfplumber is preferred, PyPDF2 is a fallback.
# Boolean flags record availability so extraction can try each in turn
# without hard-failing when a library is not installed.
try:
    from PyPDF2 import PdfReader as PyPDF2Reader
    HAS_PYPDF2 = True
except ImportError:
    HAS_PYPDF2 = False

try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False
29
+
30
  # ==============================
31
  # CONFIG
32
  # ==============================
 
35
  INDEX_PATH = "faiss_index.index"
36
  METADATA_PATH = "metadata.json"
37
 
 
38
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
39
 
40
  # ==============================
41
+ # ROBUST PDF EXTRACTION
42
  # ==============================
43
def extract_text_from_pdf_robust(file_path):
    """Try multiple PDF extraction methods in order of quality.

    Attempts pdfplumber, then pypdf, then PyPDF2; returns the first result
    with a meaningful amount of text. On total failure, inspects the raw
    file to report encrypted or (likely) scanned PDFs.

    Returns extracted text, or a human-readable diagnostic string.
    """
    methods_tried = []

    # Method 1: pdfplumber (best for tables/forms)
    if HAS_PDFPLUMBER:
        try:
            methods_tried.append("pdfplumber")
            with pdfplumber.open(file_path) as pdf:
                parts = []
                for i, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(f"\n--- Page {i+1} ---\n{page_text}")
                text = "".join(parts)
            if len(text.strip()) > 50:
                print(f"pdfplumber success: {len(text)} chars")
                return text
        except Exception as e:
            print(f"pdfplumber failed: {e}")

    # Method 2: pypdf (original)
    try:
        methods_tried.append("pypdf")
        reader = PdfReader(file_path)
        parts = []
        for i, page in enumerate(reader.pages):
            page_text = page.extract_text()
            if page_text and page_text.strip():
                parts.append(f"\n--- Page {i+1} ---\n{page_text}")
        text = "".join(parts)
        if len(text.strip()) > 50:
            print(f"pypdf success: {len(text)} chars")
            return text
        print(f"pypdf extracted only {len(text)} chars")
    except Exception as e:
        print(f"pypdf failed: {e}")

    # Method 3: PyPDF2 fallback
    if HAS_PYPDF2:
        try:
            methods_tried.append("PyPDF2")
            reader = PyPDF2Reader(file_path)
            parts = []
            for i, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    parts.append(f"\n--- Page {i+1} ---\n{page_text}")
            text = "".join(parts)
            if len(text.strip()) > 50:
                print(f"PyPDF2 success: {len(text)} chars")
                return text
        except Exception as e:
            print(f"PyPDF2 failed: {e}")

    # Method 4: raw header check (detect encrypted/scanned)
    try:
        with open(file_path, 'rb') as f:
            content = f.read(1024)
        if b'/Encrypt' in content:
            return "PDF is encrypted/protected. Please remove password."
        # BUG FIX: the original compared len(content) (capped at 1024 by the
        # read above) against 10000 — always true, so every non-encrypted PDF
        # reaching this point was reported as scanned. Check real file size.
        if os.path.getsize(file_path) < 10000:
            return "PDF appears to be scanned images (no text layer). Try OCR tools."
    except OSError:
        pass

    return f"No text extracted. Tried: {', '.join(methods_tried)}. Likely scanned PDF or protected."
107
 
108
+ # ==============================
109
+ # Other extractors (keep simple)
110
+ # ==============================
111
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a .docx file, blank-line separated."""
    try:
        document = Document(file_path)
        cleaned = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                cleaned.append(stripped)
        if not cleaned:
            return "No text in DOCX"
        return "\n\n".join(cleaned)
    except Exception as e:
        return f"DOCX error: {str(e)}"
118
 
119
def extract_text_from_excel(file_path):
    """Return a CSV preview of the first sheet (at most 100 rows)."""
    try:
        # Only the first sheet, capped at 100 rows, to keep output small.
        frame = pd.read_excel(file_path, sheet_name=0, nrows=100)
        preview = frame.fillna('').to_csv(index=False)
        return "Sheet preview:\n" + preview
    except Exception as exc:
        return "Excel error: " + str(exc)
125
 
126
def extract_text_from_txt(file_path):
    """Read a text file as UTF-8, falling back to latin-1 on decode errors.

    Note: with errors="ignore" the UTF-8 path rarely raises for encoding
    reasons; the fallback is narrowed to UnicodeDecodeError so that real
    I/O errors (e.g. file not found) propagate clearly instead of being
    swallowed by a bare except and re-raised from the second open.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(file_path, "r", encoding="latin-1", errors="ignore") as f:
            return f.read()
133
 
134
def extract_text_from_url(url):
    """Fetch *url* and return its visible text (scripts/styles removed).

    Returns at most 3000 characters, or an "URL error: ..." message on
    any failure.
    """
    try:
        r = requests.get(url, timeout=10)
        # BUG FIX: without this, 404/500 error pages were parsed and
        # ingested as if they were real content.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        for s in soup(["script", "style"]):
            s.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text[:3000]  # Limit
    except Exception as e:
        return f"URL error: {str(e)}"
144
 
145
  # ==============================
146
+ # Chunking
147
  # ==============================
148
  splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
149
 
150
  # ==============================
151
+ # SIMPLIFIED INGESTION (focus on PDF fix)
152
  # ==============================
153
def ingest_sources(files, urls):
    """Extract text from uploaded files and URLs, chunk it, and build a
    FAISS index plus a JSON metadata sidecar.

    files: Gradio upload values (file objects, path strings, or dicts).
    urls: newline-separated URL string (may be None/empty).
    Returns a human-readable status string including debug details.
    """
    docs = []
    metadata = []
    debug_info = []

    # Rebuild from scratch on every run (debug/testing behaviour).
    for path in [INDEX_PATH, METADATA_PATH]:
        if os.path.exists(path):
            os.remove(path)
            debug_info.append(f"Cleared {path}")

    processed = 0
    for f in files or []:
        processed += 1
        name = getattr(f, "name", f"file_{processed}")
        debug_info.append(f"\nπŸ” Processing: {name}")

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
        try:
            # Gradio may hand us a file object, a plain string, or a dict.
            data = None
            if hasattr(f, 'read'):
                data = f.read()
                if isinstance(data, str):
                    data = data.encode('utf-8')
            elif isinstance(f, str):
                data = f.encode('utf-8')
            elif isinstance(f, dict) and 'data' in f:
                data = f['data']
                if isinstance(data, str):
                    data = data.encode('utf-8')

            if not data:
                debug_info.append("❌ Could not read file data")
                continue

            tmp.write(data)
            tmp.flush()
            # BUG FIX: close the handle before the extractors re-open the
            # path (required on Windows, and avoids a handle leak).
            tmp.close()

            # Dispatch on extension; unknown types fall back to plain text.
            ext = os.path.splitext(name.lower())[1]
            if ext == '.pdf':
                text = extract_text_from_pdf_robust(tmp.name)
            elif ext == '.docx':
                text = extract_text_from_docx(tmp.name)
            elif ext in ['.xls', '.xlsx']:
                text = extract_text_from_excel(tmp.name)
            else:
                text = extract_text_from_txt(tmp.name)

            debug_info.append(f"πŸ“„ Extracted {len(text)} characters")

            # NOTE(review): substring heuristics — a legitimate document
            # containing the words "error" or "no text" is skipped too.
            if len(text) > 20 and "error" not in text.lower() and "no text" not in text.lower():
                chunks = splitter.split_text(text)
                valid_chunks = [c for c in chunks if len(c.strip()) > 20]
                for i, chunk in enumerate(valid_chunks):
                    docs.append(chunk)
                    metadata.append({
                        "source": name,
                        "chunk": i,
                        "type": "file",
                        "text": chunk
                    })
                debug_info.append(f"βœ… Created {len(valid_chunks)} chunks")
            else:
                debug_info.append(f"⚠️ Skipped: {'too short' if len(text) <= 20 else 'contains error'}")

        except Exception as e:
            debug_info.append(f"πŸ’₯ Error: {str(e)}")
        finally:
            try:
                tmp.close()
                os.unlink(tmp.name)
            except OSError:
                pass

    # URLs: one per line; blanks ignored.
    for raw_line in (urls or "").splitlines():
        url = raw_line.strip()
        if not url:
            continue
        text = extract_text_from_url(url)
        if len(text) > 100 and "error" not in text.lower():
            chunks = splitter.split_text(text)
            for i, c in enumerate(chunks):
                if len(c.strip()) > 20:
                    docs.append(c)
                    # BUG FIX: store the stripped URL (the one actually
                    # fetched), not the raw line with whitespace.
                    metadata.append({"source": url, "chunk": i, "type": "url", "text": c})

    debug_info.append(f"\nπŸ“Š Total: {len(docs)} chunks created")

    if not docs:
        return "❌ No valid content.\n\n" + "\n".join(debug_info)

    # Embed all chunks and persist index + metadata.
    try:
        embeddings = embed_model.encode(docs, show_progress_bar=False)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        faiss.write_index(index, INDEX_PATH)
        with open(METADATA_PATH, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False)
        return f"βœ… SUCCESS: {len(docs)} chunks ingested!\n\n" + "\n".join(debug_info[-5:])
    except Exception as e:
        return f"❌ Index failed: {str(e)}\n\n" + "\n".join(debug_info)
252
 
253
  # ==============================
254
+ # Keep retrieval and generation simple
255
  # ==============================
256
def retrieve_topk(query, k=3):
    """Return metadata dicts for the k chunks most similar to *query*.

    Returns [] when no index has been built yet.
    """
    if not os.path.exists(INDEX_PATH):
        return []
    q_emb = embed_model.encode([query])
    index = faiss.read_index(INDEX_PATH)
    D, I = index.search(q_emb, k)
    # Match the writer's encoding (ingest writes the file as UTF-8).
    with open(METADATA_PATH, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    # BUG FIX: faiss pads results with -1 when the index holds fewer than
    # k vectors; a bare `idx < len(metadata)` let -1 wrap around to the
    # LAST metadata entry. Require a non-negative index.
    return [metadata[idx] for idx in I[0] if 0 <= idx < len(metadata)]
264
 
 
 
 
265
gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)

def ask_prompt(prompt):
    """Answer *prompt* from retrieved context chunks, appending sources.

    Returns the generated answer plus a source list, or an explanatory
    message when nothing has been ingested or generation fails.
    """
    hits = retrieve_topk(prompt)
    if not hits:
        return "No documents ingested."

    # Cap each chunk at 800 chars to keep the prompt within model limits.
    context = "\n\n".join(h.get("text", "")[:800] for h in hits)
    sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]

    full_prompt = f"Context:\n{context}\n\nQ: {prompt}\nA:"
    # BUG FIX: the previous revision guarded generation with try/except;
    # that guard was dropped in this rewrite, letting pipeline errors
    # crash the UI handler. Restore it.
    try:
        result = gen_pipeline(full_prompt, max_length=300)[0]["generated_text"]
    except Exception as e:
        return f"Generation error: {str(e)}"
    return f"{result}\n\nSources:\n" + "\n".join(sources)
 
 
 
 
277
 
278
  # ==============================
279
+ # UI
280
  # ==============================
281
# Gradio UI: left column ingests files/URLs, right column asks questions.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ” Research Assistant - Debug Mode")
    with gr.Row():
        with gr.Column():
            # Ingestion inputs and debug output
            file_in = gr.File(file_count="multiple")
            urls_in = gr.Textbox(label="URLs", placeholder="https://...")
            ingest_btn = gr.Button("Ingest", variant="primary")
            ingest_out = gr.Textbox(label="Debug Output", lines=10)
        with gr.Column():
            # Question/answer panel
            prompt_in = gr.Textbox(label="Question", lines=3)
            ask_btn = gr.Button("Ask")
            answer_out = gr.Textbox(lines=10)

    # Wire buttons to the ingestion and QA functions.
    ingest_btn.click(ingest_sources, [file_in, urls_in], ingest_out)
    ask_btn.click(ask_prompt, prompt_in, answer_out)

if __name__ == "__main__":
    demo.launch()