Morinash committed on
Commit
2d0f5ab
Β·
verified Β·
1 Parent(s): 26c15de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +346 -196
app.py CHANGED
@@ -13,19 +13,11 @@ import faiss
13
  import numpy as np
14
  from transformers import pipeline
15
  import traceback
 
16
 
17
- # Try multiple PDF libraries
18
- try:
19
- from PyPDF2 import PdfReader as PyPDF2Reader
20
- HAS_PYPDF2 = True
21
- except ImportError:
22
- HAS_PYPDF2 = False
23
-
24
- try:
25
- import pdfplumber
26
- HAS_PDFPLUMBER = True
27
- except ImportError:
28
- HAS_PDFPLUMBER = False
29
 
30
  # ==============================
31
  # CONFIG
@@ -35,264 +27,422 @@ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
35
  INDEX_PATH = "faiss_index.index"
36
  METADATA_PATH = "metadata.json"
37
 
38
- embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # ==============================
41
- # ROBUST PDF EXTRACTION
42
  # ==============================
43
- def extract_text_from_pdf_robust(file_path):
44
- """Try multiple PDF extraction methods"""
45
- methods_tried = []
46
-
47
- # Method 1: pdfplumber (best for tables/forms)
48
- if HAS_PDFPLUMBER:
49
- try:
50
- methods_tried.append("pdfplumber")
51
- with pdfplumber.open(file_path) as pdf:
52
- text = ""
53
- for i, page in enumerate(pdf.pages):
54
- page_text = page.extract_text()
55
- if page_text:
56
- text += f"\n--- Page {i+1} ---\n{page_text}"
57
- if len(text.strip()) > 50:
58
- print(f"pdfplumber success: {len(text)} chars")
59
- return text
60
- except Exception as e:
61
- print(f"pdfplumber failed: {e}")
62
-
63
- # Method 2: pypdf (original)
64
  try:
65
- methods_tried.append("pypdf")
66
  reader = PdfReader(file_path)
67
  text = ""
68
- for i, page in enumerate(reader.pages):
69
- page_text = page.extract_text()
70
- if page_text and page_text.strip():
71
- text += f"\n--- Page {i+1} ---\n{page_text}"
72
- if len(text.strip()) > 50:
73
- print(f"pypdf success: {len(text)} chars")
74
- return text
75
- print(f"pypdf extracted only {len(text)} chars")
76
- except Exception as e:
77
- print(f"pypdf failed: {e}")
78
-
79
- # Method 3: PyPDF2 fallback
80
- if HAS_PYPDF2:
81
- try:
82
- methods_tried.append("PyPDF2")
83
- reader = PyPDF2Reader(file_path)
84
- text = ""
85
- for i, page in enumerate(reader.pages):
86
  page_text = page.extract_text()
87
  if page_text and page_text.strip():
88
- text += f"\n--- Page {i+1} ---\n{page_text}"
89
- if len(text.strip()) > 50:
90
- print(f"PyPDF2 success: {len(text)} chars")
91
- return text
92
- except Exception as e:
93
- print(f"PyPDF2 failed: {e}")
94
-
95
- # Method 4: Raw bytes check (detect encrypted/scanned)
96
- try:
97
- with open(file_path, 'rb') as f:
98
- content = f.read(1024)
99
- if b'/Encrypt' in content:
100
- return "PDF is encrypted/protected. Please remove password."
101
- if len(content) < 10000:
102
- return "PDF appears to be scanned images (no text layer). Try OCR tools."
103
- except:
104
- pass
105
-
106
- return f"No text extracted. Tried: {', '.join(methods_tried)}. Likely scanned PDF or protected."
107
 
108
- # ==============================
109
- # Other extractors (keep simple)
110
- # ==============================
111
  def extract_text_from_docx(file_path):
 
112
  try:
113
  doc = Document(file_path)
114
  paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
115
- return "\n\n".join(paragraphs) if paragraphs else "No text in DOCX"
 
 
116
  except Exception as e:
 
117
  return f"DOCX error: {str(e)}"
118
 
119
  def extract_text_from_excel(file_path):
 
120
  try:
121
- df = pd.read_excel(file_path, sheet_name=0, nrows=100) # Limit rows
122
- return f"Sheet preview:\n{df.fillna('').to_csv(index=False)}"
 
 
123
  except Exception as e:
 
124
  return f"Excel error: {str(e)}"
125
 
126
  def extract_text_from_txt(file_path):
127
- try:
128
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
129
- return f.read()
130
- except:
131
- with open(file_path, "r", encoding="latin-1", errors="ignore") as f:
132
- return f.read()
 
 
 
133
 
134
  def extract_text_from_url(url):
 
135
  try:
136
- r = requests.get(url, timeout=10)
137
- soup = BeautifulSoup(r.text, "html.parser")
138
- for s in soup(["script", "style"]):
139
- s.decompose()
140
- text = soup.get_text(separator="\n", strip=True)
141
- return text[:3000] # Limit
 
 
 
 
 
 
142
  except Exception as e:
 
143
  return f"URL error: {str(e)}"
144
 
145
  # ==============================
146
- # Chunking
147
- # ==============================
148
- splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
149
-
150
- # ==============================
151
- # SIMPLIFIED INGESTION (focus on PDF fix)
152
  # ==============================
153
  def ingest_sources(files, urls):
 
154
  docs = []
155
  metadata = []
156
  debug_info = []
157
 
158
- # Clear old index for testing
159
  for path in [INDEX_PATH, METADATA_PATH]:
160
  if os.path.exists(path):
161
  os.remove(path)
162
- debug_info.append(f"Cleared {path}")
163
 
164
- processed = 0
 
165
  for f in files or []:
166
- processed += 1
167
- name = getattr(f, "name", f"file_{processed}")
168
- debug_info.append(f"\nπŸ” Processing: {name}")
169
-
170
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
171
  try:
172
- # Handle Gradio file formats
173
- data = None
174
- if hasattr(f, 'read'):
175
- data = f.read()
176
- if isinstance(data, str): data = data.encode('utf-8')
177
- elif isinstance(f, str):
178
- data = f.encode('utf-8')
179
- elif isinstance(f, dict) and 'data' in f:
180
- data = f['data']
181
- if isinstance(data, str): data = data.encode('utf-8')
182
 
183
- if not data:
184
- debug_info.append("❌ Could not read file data")
185
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- tmp.write(data)
188
- tmp.flush()
 
189
 
190
- # Extract text
191
  ext = os.path.splitext(name.lower())[1]
 
 
192
  if ext == '.pdf':
193
- text = extract_text_from_pdf_robust(tmp.name)
194
- elif ext == '.docx':
195
- text = extract_text_from_docx(tmp.name)
196
- elif ext in ['.xls', '.xlsx']:
197
- text = extract_text_from_excel(tmp.name)
198
  else:
199
- text = extract_text_from_txt(tmp.name)
200
 
201
- debug_info.append(f"πŸ“„ Extracted {len(text)} characters")
 
 
 
 
 
202
 
203
- # Lower threshold for "valid" content
204
- if len(text) > 20 and "error" not in text.lower() and "no text" not in text.lower():
205
  chunks = splitter.split_text(text)
206
- valid_chunks = [c for c in chunks if len(c.strip()) > 20]
 
207
  for i, chunk in enumerate(valid_chunks):
208
  docs.append(chunk)
209
  metadata.append({
210
- "source": name,
211
- "chunk": i,
212
- "type": "file",
213
- "text": chunk
214
  })
 
215
  debug_info.append(f"βœ… Created {len(valid_chunks)} chunks")
216
  else:
217
- debug_info.append(f"⚠️ Skipped: {'too short' if len(text) <= 20 else 'contains error'}")
 
 
 
 
 
 
218
 
219
  except Exception as e:
220
- debug_info.append(f"πŸ’₯ Error: {str(e)}")
221
- finally:
222
- try: os.unlink(tmp.name)
223
- except: pass
224
 
225
- # URLs (simplified)
226
- for url in (urls or "").splitlines():
227
- if url.strip():
228
- text = extract_text_from_url(url.strip())
229
- if len(text) > 100 and "error" not in text.lower():
230
- chunks = splitter.split_text(text)
231
- for i, c in enumerate(chunks):
232
- if len(c.strip()) > 20:
233
- docs.append(c)
234
- metadata.append({"source": url, "chunk": i, "type": "url", "text": c})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
- debug_info.append(f"\nπŸ“Š Total: {len(docs)} chunks created")
237
 
238
  if not docs:
239
- return "❌ No valid content.\n\n" + "\n".join(debug_info)
240
 
241
- # Build index
242
  try:
243
- embeddings = embed_model.encode(docs, show_progress_bar=False)
244
- index = faiss.IndexFlatL2(embeddings.shape[1])
245
- index.add(embeddings)
 
 
 
 
246
  faiss.write_index(index, INDEX_PATH)
247
- with open(METADATA_PATH, "w", encoding="utf-8") as f:
248
- json.dump(metadata, f, ensure_ascii=False)
249
- return f"βœ… SUCCESS: {len(docs)} chunks ingested!\n\n" + "\n".join(debug_info[-5:])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  except Exception as e:
251
- return f"❌ Index failed: {str(e)}\n\n" + "\n".join(debug_info)
 
252
 
253
  # ==============================
254
- # Keep retrieval and generation simple
255
  # ==============================
256
- def retrieve_topk(query, k=3):
257
- if not os.path.exists(INDEX_PATH): return []
258
- q_emb = embed_model.encode([query])
259
- index = faiss.read_index(INDEX_PATH)
260
- D, I = index.search(q_emb, k)
261
- with open(METADATA_PATH, "r") as f:
262
- metadata = json.load(f)
263
- return [metadata[idx] for idx in I[0] if idx < len(metadata)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
 
266
 
267
- def ask_prompt(prompt):
268
- hits = retrieve_topk(prompt)
269
- if not hits: return "No documents ingested."
270
-
271
- context = "\n\n".join([h.get("text", "")[:800] for h in hits])
272
- sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
273
-
274
- full_prompt = f"Context:\n{context}\n\nQ: {prompt}\nA:"
275
- result = gen_pipeline(full_prompt, max_length=300)[0]["generated_text"]
276
- return f"{result}\n\nSources:\n" + "\n".join(sources)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  # ==============================
279
- # UI
280
  # ==============================
281
- with gr.Blocks() as demo:
282
- gr.Markdown("# πŸ” Research Assistant - Debug Mode")
283
- with gr.Row():
284
- with gr.Column():
285
- file_in = gr.File(file_count="multiple")
286
- urls_in = gr.Textbox(label="URLs", placeholder="https://...")
287
- ingest_btn = gr.Button("Ingest", variant="primary")
288
- ingest_out = gr.Textbox(label="Debug Output", lines=10)
289
- with gr.Column():
290
- prompt_in = gr.Textbox(label="Question", lines=3)
291
- ask_btn = gr.Button("Ask")
292
- answer_out = gr.Textbox(lines=10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
- ingest_btn.click(ingest_sources, [file_in, urls_in], ingest_out)
295
- ask_btn.click(ask_prompt, prompt_in, answer_out)
296
 
 
 
 
297
  if __name__ == "__main__":
298
- demo.launch()
 
 
 
 
 
 
 
13
  import numpy as np
14
  from transformers import pipeline
15
  import traceback
16
+ import logging
17
 
18
+ # Set up logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
21
 
22
  # ==============================
23
  # CONFIG
 
27
  INDEX_PATH = "faiss_index.index"
28
  METADATA_PATH = "metadata.json"
29
 
30
# Module-level model handles, populated once by initialize_models().
embed_model = None
splitter = None
gen_pipeline = None

def initialize_models():
    """Load the embedding model, text splitter and generation pipeline.

    Returns:
        True when every model loaded, False otherwise (the failure is logged).
    """
    global embed_model, splitter, gen_pipeline
    try:
        embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500, chunk_overlap=200
        )
        gen_pipeline = pipeline(
            "text2text-generation", model=HF_GENERATION_MODEL, device=-1
        )
    except Exception as e:
        logger.error(f"Model initialization failed: {e}")
        return False
    else:
        logger.info("Models initialized successfully")
        return True

# Fail fast at import time: the app is unusable without the models.
if not initialize_models():
    raise RuntimeError("Failed to initialize models")
50
 
51
  # ==============================
52
+ # FILE EXTRACTION FUNCTIONS
53
  # ==============================
54
def extract_text_from_pdf(file_path, max_pages=5):
    """Extract text from a PDF file using pypdf.

    Args:
        file_path: Path of the PDF on disk.
        max_pages: Maximum number of leading pages to read (default 5, kept
            small for speed — the previous hard-coded ``[:5]`` limit is now a
            parameter; pass None to read every page).

    Returns:
        The extracted text (stripped), or a string starting with
        "PDF extraction failed:" on error — callers detect failure by that
        prefix rather than by an exception.
    """
    try:
        logger.info(f"Extracting PDF from: {file_path}")
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        pages = reader.pages if max_pages is None else reader.pages[:max_pages]

        parts = []  # join once at the end instead of quadratic string +=
        for i, page in enumerate(pages):
            try:
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    # Page header helps readers/chunkers locate content.
                    parts.append(f"\n--- Page {i+1}/{page_count} ---\n{page_text}\n")
            except Exception as e:
                # A single unreadable page should not abort the document.
                logger.warning(f"Failed to extract page {i+1}: {e}")
                continue

        text = "".join(parts)
        logger.info(f"PDF extraction complete: {len(text)} characters")
        return text.strip()

    except Exception as e:
        logger.error(f"PDF extraction error: {e}")
        return f"PDF extraction failed: {str(e)}"
 
 
 
 
 
 
 
 
78
 
 
 
 
79
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a DOCX file joined by blank lines.

    On failure, returns a "DOCX error: ..." message string instead of raising.
    """
    try:
        document = Document(file_path)
        paragraphs = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                paragraphs.append(stripped)
        logger.info(f"DOCX extraction: {len(paragraphs)} paragraphs")
        return "\n\n".join(paragraphs)
    except Exception as e:
        logger.error(f"DOCX extraction error: {e}")
        return f"DOCX error: {str(e)}"
90
 
91
def extract_text_from_excel(file_path):
    """Return a text preview of the first sheet of an Excel workbook.

    Reads at most 50 rows for speed; on failure returns an
    "Excel error: ..." message string instead of raising.
    """
    try:
        # nrows caps how much of a potentially huge sheet we load.
        frame = pd.read_excel(file_path, sheet_name=0, nrows=50)
        logger.info(f"Excel extraction: {frame.shape}")
        body = frame.fillna('').to_string(index=False)
        return f"Excel Sheet Preview ({frame.shape[0]} rows):\n\n{body}"
    except Exception as e:
        logger.error(f"Excel extraction error: {e}")
        return f"Excel error: {str(e)}"
101
 
102
def extract_text_from_txt(file_path):
    """Read a plain-text file, trying several encodings in order.

    Each candidate encoding is tried with *strict* error handling so that a
    mis-encoded file actually falls through to the next candidate. The old
    code passed ``errors='ignore'``, which made the very first (utf-8)
    attempt always "succeed" and silently drop bytes — the fallback
    encodings were unreachable dead code.

    Returns:
        The file contents, or the message string
        "Could not read text file with available encodings" when the file
        cannot be opened/read at all (matches the old failure contract).
    """
    # latin-1 maps every byte, so it goes last as the universal fallback.
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue  # wrong encoding — try the next candidate
        except OSError:
            return "Could not read text file with available encodings"
    # Practically unreachable (latin-1 cannot fail to decode); kept as a
    # lossy last resort for safety.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()
112
 
113
def extract_text_from_url(url):
    """Fetch a URL and return its visible text, capped at 100 lines.

    Scripts, styles, navigation and footers are stripped before text
    extraction. On any failure, returns a "URL error: ..." message string
    instead of raising.
    """
    try:
        response = requests.get(
            url,
            timeout=15,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'},
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop non-content elements so get_text() returns only body copy.
        for tag in soup(["script", "style", "nav", "footer"]):
            tag.decompose()

        raw = soup.get_text(separator='\n', strip=True)
        # Collapse blank lines and cap the output size.
        cleaned = [ln.strip() for ln in raw.split('\n') if ln.strip()]
        return '\n'.join(cleaned[:100])
    except Exception as e:
        logger.error(f"URL extraction error: {e}")
        return f"URL error: {str(e)}"
131
 
132
  # ==============================
133
+ # MAIN INGESTION FUNCTION
 
 
 
 
 
134
  # ==============================
135
# Error-message prefixes produced by the extractors; text starting with one
# of these is a failure report, not document content, and must not be indexed.
_EXTRACTION_ERROR_PREFIXES = (
    'error', 'PDF extraction failed', 'DOCX error',
    'Excel error', 'URL error', 'Could not read text file',
)


def _read_upload_bytes(f):
    """Return the raw bytes of a Gradio upload in any of the formats Gradio
    may hand us (file-like object, dict payload, or plain string).
    Returns None when no data can be obtained."""
    data = None
    if hasattr(f, 'read'):
        data = f.read()
    elif isinstance(f, dict) and 'data' in f:
        data = f['data']
    elif isinstance(f, str):
        data = f
    if isinstance(data, str):
        data = data.encode('utf-8')
    return data or None


def _extract_for_extension(ext, path):
    """Dispatch to the extractor matching a (lowercased) file extension."""
    if ext == '.pdf':
        return extract_text_from_pdf(path)
    if ext in ('.doc', '.docx'):
        return extract_text_from_docx(path)
    if ext in ('.xls', '.xlsx'):
        return extract_text_from_excel(path)
    # BUGFIX: .csv used to be routed to pd.read_excel, which cannot parse
    # CSV and always yielded an "Excel error" string; CSV is plain text.
    return extract_text_from_txt(path)


def ingest_sources(files, urls):
    """Process uploaded files and URLs, chunk their text, and write a FAISS
    index (INDEX_PATH) plus JSON metadata (METADATA_PATH) to disk.

    Args:
        files: iterable of Gradio upload objects, or None.
        urls: newline-separated URL string, or None/empty.

    Returns:
        A human-readable status/debug string for display in the UI.
    """
    docs = []
    metadata = []
    debug_info = []

    # Clear existing index so every ingestion starts fresh.
    for path in [INDEX_PATH, METADATA_PATH]:
        if os.path.exists(path):
            os.remove(path)
            debug_info.append(f"🗑️ Cleared existing {os.path.basename(path)}")

    # ---- Files ----
    processed_files = 0
    for f in files or []:
        processed_files += 1
        tmp_path = None
        try:
            name = getattr(f, 'name', f'file_{processed_files}') or f'uploaded_file_{processed_files}'
            debug_info.append(f"\n📁 Processing: {os.path.basename(name)}")

            file_data = _read_upload_bytes(f)
            if not file_data:
                debug_info.append("❌ No file data available")
                continue

            # Stage the bytes in a temp file so the path-based extractors
            # can open it. (Default temp dir — the old dir='/tmp' was
            # non-portable.)
            suffix = os.path.splitext(name)[1] or '.txt'
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(file_data)
                tmp_path = tmp.name

            ext = os.path.splitext(name.lower())[1]
            text = _extract_for_extension(ext, tmp_path)

            preview = text[:200].replace('\n', ' ').strip()
            if len(preview) > 100:
                preview = preview[:100] + "..."
            debug_info.append(f"📄 Extracted {len(text)} chars")
            debug_info.append(f"🔍 Preview: '{preview}'")

            # BUGFIX: the old check only recognised two error prefixes, so
            # strings like "DOCX error: ..." were chunked and indexed as if
            # they were document content.
            if len(text.strip()) > 30 and not text.startswith(_EXTRACTION_ERROR_PREFIXES):
                chunks = splitter.split_text(text)
                valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
                for i, chunk in enumerate(valid_chunks):
                    docs.append(chunk)
                    metadata.append({
                        "source": os.path.basename(name),
                        "chunk_id": i,
                        "type": "file",
                        "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
                    })
                debug_info.append(f"✅ Created {len(valid_chunks)} chunks")
            else:
                debug_info.append("⚠️ Skipped: insufficient content or extraction error")

        except Exception as e:
            debug_info.append(f"💥 Error processing file: {str(e)}")
            logger.error(f"File processing error: {e}", exc_info=True)
        finally:
            # BUGFIX: cleanup now runs on every path — the old code leaked
            # the temp file whenever processing bailed out early or raised.
            if tmp_path:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass

    # ---- URLs ----
    if urls and urls.strip():
        debug_info.append(f"\n🌐 Processing URLs:")
        for url_line in urls.strip().split('\n'):
            url = url_line.strip()
            if not url.startswith('http'):
                continue
            debug_info.append(f" 📡 {url}")
            text = extract_text_from_url(url)
            if len(text.strip()) > 100 and not text.startswith('URL error'):
                chunks = splitter.split_text(text)
                valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
                for i, chunk in enumerate(valid_chunks):
                    docs.append(chunk)
                    metadata.append({
                        "source": url,
                        "chunk_id": i,
                        "type": "url",
                        "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
                    })
                debug_info.append(f" ✅ Created {len(valid_chunks)} chunks from URL")
            else:
                debug_info.append(f" ⚠️ URL skipped: insufficient content")

    debug_info.append(f"\n📊 SUMMARY: {len(docs)} total chunks created")

    if not docs:
        return "❌ No valid content extracted.\n\n" + "\n".join(debug_info[-15:])

    # ---- Build FAISS index ----
    try:
        debug_info.append("🔄 Creating embeddings and index...")
        embeddings = embed_model.encode(docs, show_progress_bar=False, convert_to_numpy=True)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        # FAISS requires float32 input vectors.
        index.add(embeddings.astype('float32'))

        faiss.write_index(index, INDEX_PATH)
        with open(METADATA_PATH, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        debug_info.append(f"✅ Index created successfully: {embeddings.shape[0]} vectors")
        return f"🎉 SUCCESS! Ingested {len(docs)} chunks from {processed_files} files.\n\n" + "\n".join(debug_info[-8:])

    except Exception as e:
        debug_info.append(f"💥 Index creation failed: {str(e)}")
        logger.error(f"Index creation error: {e}", exc_info=True)
        return f"❌ Indexing failed: {str(e)}\n\n" + "\n".join(debug_info[-10:])
282
+
283
+ # ==============================
284
+ # RETRIEVAL
285
+ # ==============================
286
def retrieve_topk(query, k=5):
    """Return up to *k* metadata records most similar to *query*.

    Embeds the query, searches the on-disk FAISS index, and joins the hit
    indices back to the saved metadata. Each result dict carries the stored
    metadata plus a "distance" key (L2 distance; lower is more similar).

    Returns [] when no index exists yet or on any error.
    """
    try:
        if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
            return []

        query_embedding = embed_model.encode([query], convert_to_numpy=True)
        index = faiss.read_index(INDEX_PATH)
        distances, indices = index.search(query_embedding.astype('float32'), k)

        with open(METADATA_PATH, 'r', encoding='utf-8') as f:
            metadata = json.load(f)

        results = []
        for rank, idx in enumerate(indices[0]):
            # BUGFIX: FAISS pads results with -1 when the index holds fewer
            # than k vectors; the old `idx < len(metadata)` test let -1
            # through and wrongly returned the *last* metadata entry.
            if 0 <= idx < len(metadata):
                results.append({
                    **metadata[idx],
                    "distance": float(distances[0][rank])
                })

        # index.search already returns at most k hits.
        return results
    except Exception as e:
        logger.error(f"Retrieval error: {e}")
        return []
311
 
312
  # ==============================
313
+ # GENERATION
314
  # ==============================
315
def ask_prompt(query):
    """Answer *query* from retrieved context using the generation pipeline.

    Retrieves the top-3 chunks, builds a prompt from their stored previews,
    runs the text2text model, and returns the answer followed by a source
    list. Returns a user-facing message string on any failure.

    NOTE(review): context is built from `content_preview`, which ingestion
    truncates to ~100 chars per chunk — answer quality is bounded by that;
    storing full chunk text would need a coordinated ingest change.
    """
    try:
        hits = retrieve_topk(query, k=3)
        if not hits:
            return "No relevant documents found. Please ingest some files first."

        # Collect usable context and a citation line per contributing hit.
        context_parts = []
        sources = []
        for hit in hits:
            content = hit.get('content_preview', '') or ''
            if len(content) > 50:
                context_parts.append(content)
                source_info = f"{hit['source']} (chunk {hit['chunk_id']})"
                # BUGFIX: a perfect match has distance 0.0, which is falsy —
                # the old `if hit.get('distance')` dropped its annotation.
                if hit.get('distance') is not None:
                    source_info += f" [relevance: {hit['distance']:.3f}]"
                sources.append(source_info)

        if not context_parts:
            return "Retrieved documents but no content available."

        context = "\n\n".join(context_parts)
        full_prompt = f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer:"""

        # Deterministic decoding; `temperature` was removed because it is
        # ignored (and warned about) when do_sample=False.
        result = gen_pipeline(
            full_prompt,
            max_length=400,
            min_length=50,
            do_sample=False,
        )[0]['generated_text']

        # Strip an echoed "Answer:" prefix if the model repeats the prompt.
        if "Answer:" in result:
            answer = result.split("Answer:", 1)[1].strip()
        else:
            answer = result

        return f"{answer}\n\n**Sources:**\n" + "\n".join(sources)

    except Exception as e:
        logger.error(f"Generation error: {e}")
        return f"Error generating response: {str(e)}"
368
 
369
  # ==============================
370
+ # GRADIO UI
371
  # ==============================
372
def create_ui():
    """Build and return the Gradio Blocks interface.

    Two-column layout: document ingestion (uploads/URLs + status) on the
    left, question answering on the right. Event wiring connects the
    buttons to ingest_sources() and ask_prompt().
    """
    with gr.Blocks(title="Research Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔍 Research Assistant
        Upload documents and ask questions about their content.
        """)

        with gr.Row():
            # Left column: ingestion controls and status console.
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Document Ingestion")
                file_input = gr.File(
                    label="Upload Files",
                    file_count="multiple",
                    file_types=[".pdf", ".docx", ".doc", ".txt", ".xlsx", ".xls", ".csv"]
                )
                url_input = gr.Textbox(
                    label="Or paste URLs (one per line)",
                    placeholder="https://example.com/document\nhttps://another-site.com/page",
                    lines=3
                )
                ingest_button = gr.Button("🚀 Ingest Documents", variant="primary", size="lg")
                status_output = gr.Textbox(
                    label="Ingestion Status",
                    lines=12,
                    interactive=False
                )

            # Right column: question input and generated answer.
            with gr.Column(scale=1):
                gr.Markdown("### ❓ Ask Questions")
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What does the document say about...",
                    lines=3
                )
                ask_button = gr.Button("💬 Get Answer", variant="secondary")
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=12,
                    interactive=False
                )

        # Event handlers
        ingest_button.click(
            ingest_sources,
            inputs=[file_input, url_input],
            outputs=status_output
        )

        ask_button.click(
            ask_prompt,
            inputs=query_input,
            outputs=answer_output
        )

        # Examples — clicking one fills the question box.
        gr.Examples(
            examples=[
                ["What is the main topic of the documents?"],
                ["Summarize the key points."],
                ["What are the dates mentioned?"],
            ],
            inputs=query_input
        )

    return demo
 
437
 
438
+ # ==============================
439
+ # MAIN
440
+ # ==============================
441
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the conventional
    # Gradio/Spaces port.
    app = create_ui()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # flip to True to get a public gradio.live link
        debug=True,
    )