Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,11 +11,9 @@ import faiss
|
|
| 11 |
import numpy as np
|
| 12 |
from transformers import pipeline
|
| 13 |
import logging
|
| 14 |
-
import
|
| 15 |
-
import shutil
|
| 16 |
-
import re
|
| 17 |
|
| 18 |
-
# PDF libraries
|
| 19 |
try:
|
| 20 |
from pypdf import PdfReader
|
| 21 |
HAS_PYPDF = True
|
|
@@ -28,8 +26,6 @@ try:
|
|
| 28 |
except:
|
| 29 |
HAS_PDFPLUMBER = False
|
| 30 |
|
| 31 |
-
HAS_PDFTOTEXT = shutil.which('pdftotext') is not None
|
| 32 |
-
|
| 33 |
logging.basicConfig(level=logging.INFO)
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
|
@@ -42,362 +38,307 @@ INDEX_PATH = "faiss_index.index"
|
|
| 42 |
METADATA_PATH = "metadata.json"
|
| 43 |
|
| 44 |
# Initialize models
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
|
| 48 |
-
logger.info("Models loaded successfully")
|
| 49 |
-
except Exception as e:
|
| 50 |
-
logger.error(f"Model loading failed: {e}")
|
| 51 |
-
raise
|
| 52 |
|
| 53 |
# ==============================
|
| 54 |
-
# SIMPLE TEXT SPLITTER
|
| 55 |
# ==============================
|
| 56 |
def simple_text_splitter(text, chunk_size=1000, chunk_overlap=100):
|
| 57 |
-
"""Simple recursive text splitter without langchain"""
|
| 58 |
if len(text) <= chunk_size:
|
| 59 |
return [text.strip()]
|
| 60 |
|
| 61 |
chunks = []
|
| 62 |
start = 0
|
| 63 |
-
|
| 64 |
while start < len(text):
|
| 65 |
-
end = start + chunk_size
|
| 66 |
-
|
| 67 |
-
# Try to split at sentence boundaries
|
| 68 |
-
if end < len(text):
|
| 69 |
-
# Look for sentence endings near chunk boundary
|
| 70 |
-
for boundary in [end-50, end-20, end]:
|
| 71 |
-
if boundary < len(text):
|
| 72 |
-
# Find sentence breaks
|
| 73 |
-
sentence_end = text.rfind('.', 0, boundary)
|
| 74 |
-
sentence_end = max(sentence_end, text.rfind('!', 0, boundary))
|
| 75 |
-
sentence_end = max(sentence_end, text.rfind('?', 0, boundary))
|
| 76 |
-
sentence_end = max(sentence_end, text.rfind('\n\n', 0, boundary))
|
| 77 |
-
|
| 78 |
-
if sentence_end > start + chunk_overlap:
|
| 79 |
-
end = sentence_end + 1
|
| 80 |
-
break
|
| 81 |
-
|
| 82 |
chunk = text[start:end].strip()
|
| 83 |
-
if len(chunk) > 50:
|
| 84 |
chunks.append(chunk)
|
| 85 |
-
|
| 86 |
start = end - chunk_overlap
|
| 87 |
-
|
| 88 |
-
return chunks
|
| 89 |
|
| 90 |
# ==============================
|
| 91 |
-
#
|
| 92 |
# ==============================
|
| 93 |
-
def
|
| 94 |
-
"""
|
| 95 |
-
|
| 96 |
|
| 97 |
-
# Method 1:
|
| 98 |
-
if
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
['pdftotext', '-layout', file_path, '-'],
|
| 102 |
-
capture_output=True, text=True, timeout=20
|
| 103 |
-
)
|
| 104 |
-
if result.returncode == 0 and len(result.stdout.strip()) > 20:
|
| 105 |
-
return result.stdout.strip()
|
| 106 |
-
except Exception as e:
|
| 107 |
-
logger.warning(f"pdftotext failed: {e}")
|
| 108 |
|
| 109 |
-
# Method 2:
|
| 110 |
-
if
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
for page in pdf.pages[:5]: # First 5 pages
|
| 114 |
-
page_text = page.extract_text()
|
| 115 |
-
if page_text:
|
| 116 |
-
text += page_text + "\n\n"
|
| 117 |
-
if len(text.strip()) > 50:
|
| 118 |
-
return text.strip()
|
| 119 |
-
except Exception as e:
|
| 120 |
-
logger.warning(f"pdfplumber failed: {e}")
|
| 121 |
|
| 122 |
-
# Method 3:
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
elif hasattr(page, 'extractText'):
|
| 133 |
-
page_text = page.extractText()
|
| 134 |
-
|
| 135 |
-
if page_text and len(page_text.strip()) > 10:
|
| 136 |
-
text += page_text + "\n\n"
|
| 137 |
-
except:
|
| 138 |
-
continue # Skip problematic pages
|
| 139 |
-
|
| 140 |
-
if len(text.strip()) > 50:
|
| 141 |
-
return text.strip()
|
| 142 |
-
except Exception as e:
|
| 143 |
-
logger.warning(f"pypdf failed: {e}")
|
| 144 |
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
# ==============================
|
| 148 |
-
#
|
| 149 |
# ==============================
|
| 150 |
-
def
|
|
|
|
|
|
|
|
|
|
| 151 |
try:
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
| 158 |
try:
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
|
|
|
| 175 |
|
| 176 |
-
def
|
| 177 |
try:
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
| 185 |
except:
|
| 186 |
-
return "
|
| 187 |
|
| 188 |
# ==============================
|
| 189 |
-
# MAIN INGESTION
|
| 190 |
# ==============================
|
| 191 |
def ingest_sources(files, urls=""):
|
| 192 |
docs = []
|
| 193 |
metadata = []
|
| 194 |
debug_info = []
|
| 195 |
|
| 196 |
-
# Clear existing
|
| 197 |
for path in [INDEX_PATH, METADATA_PATH]:
|
| 198 |
if os.path.exists(path):
|
| 199 |
-
|
| 200 |
-
os.remove(path)
|
| 201 |
-
debug_info.append(f"🗑️ Cleared {os.path.basename(path)}")
|
| 202 |
-
except:
|
| 203 |
-
pass
|
| 204 |
|
| 205 |
# Process files
|
| 206 |
-
for i,
|
| 207 |
-
|
| 208 |
-
name = getattr(f, 'name', f'file_{i+1}')
|
| 209 |
-
debug_info.append(f"\n📄 Processing: {os.path.basename(name) if name else 'Unknown'}")
|
| 210 |
-
|
| 211 |
-
# Create temp file
|
| 212 |
-
ext = '.txt'
|
| 213 |
-
if name:
|
| 214 |
-
ext = os.path.splitext(name)[1] or '.txt'
|
| 215 |
-
|
| 216 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
|
| 217 |
-
# Read file data
|
| 218 |
-
data = None
|
| 219 |
-
if hasattr(f, 'read'):
|
| 220 |
-
data = f.read()
|
| 221 |
-
if isinstance(data, str):
|
| 222 |
-
data = data.encode('utf-8')
|
| 223 |
-
elif isinstance(f, dict) and 'data' in f:
|
| 224 |
-
data = f['data']
|
| 225 |
-
|
| 226 |
-
if not data:
|
| 227 |
-
debug_info.append("❌ No file data")
|
| 228 |
-
continue
|
| 229 |
-
|
| 230 |
-
tmp.write(data)
|
| 231 |
-
tmp_path = tmp.name
|
| 232 |
-
|
| 233 |
-
# Extract text
|
| 234 |
-
file_ext = os.path.splitext(name.lower())[1] if name else ''
|
| 235 |
-
text = ""
|
| 236 |
-
|
| 237 |
-
if file_ext == '.pdf':
|
| 238 |
-
text = extract_text_from_pdf_simple(tmp_path)
|
| 239 |
-
elif file_ext in ['.docx', '.doc']:
|
| 240 |
-
text = extract_text_from_docx(tmp_path)
|
| 241 |
-
elif file_ext in ['.xlsx', '.xls', '.csv']:
|
| 242 |
-
text = extract_text_from_excel(tmp_path)
|
| 243 |
-
else:
|
| 244 |
-
text = extract_text_from_txt(tmp_path)
|
| 245 |
-
|
| 246 |
-
# Debug preview
|
| 247 |
-
preview = text[:150].replace('\n', ' ').strip()
|
| 248 |
-
if len(preview) > 100:
|
| 249 |
-
preview = preview[:100] + "..."
|
| 250 |
-
|
| 251 |
-
debug_info.append(f"📊 Extracted {len(text)} characters")
|
| 252 |
-
debug_info.append(f"🔍 Preview: '{preview}'")
|
| 253 |
-
|
| 254 |
-
# Create chunks if we have content
|
| 255 |
-
if len(text.strip()) > 30 and not text.startswith(('PDF extraction failed', 'extraction failed')):
|
| 256 |
-
chunks = simple_text_splitter(text)
|
| 257 |
-
valid_chunks = [c for c in chunks if len(c.strip()) > 20]
|
| 258 |
-
|
| 259 |
-
for j, chunk in enumerate(valid_chunks):
|
| 260 |
-
docs.append(chunk)
|
| 261 |
-
metadata.append({
|
| 262 |
-
"source": os.path.basename(name) if name else f"file_{i+1}",
|
| 263 |
-
"chunk": j,
|
| 264 |
-
"text": chunk
|
| 265 |
-
})
|
| 266 |
-
|
| 267 |
-
debug_info.append(f"�� Created {len(valid_chunks)} chunks")
|
| 268 |
-
else:
|
| 269 |
-
debug_info.append("⚠️ Skipped: insufficient content or extraction error")
|
| 270 |
-
|
| 271 |
-
# Cleanup
|
| 272 |
-
try:
|
| 273 |
-
os.unlink(tmp_path)
|
| 274 |
-
except:
|
| 275 |
-
pass
|
| 276 |
-
|
| 277 |
-
except Exception as e:
|
| 278 |
-
debug_info.append(f"💥 Error: {str(e)}")
|
| 279 |
-
logger.error(f"File processing error: {e}")
|
| 280 |
-
|
| 281 |
-
# Process URLs
|
| 282 |
-
for url in urls.strip().split('\n'):
|
| 283 |
-
url = url.strip()
|
| 284 |
-
if url.startswith('http'):
|
| 285 |
-
text = extract_text_from_url(url)
|
| 286 |
-
if len(text.strip()) > 100:
|
| 287 |
-
chunks = simple_text_splitter(text)
|
| 288 |
-
for j, chunk in enumerate(chunks):
|
| 289 |
-
if len(chunk.strip()) > 20:
|
| 290 |
-
docs.append(chunk)
|
| 291 |
-
metadata.append({
|
| 292 |
-
"source": url,
|
| 293 |
-
"chunk": j,
|
| 294 |
-
"text": chunk,
|
| 295 |
-
"type": "url"
|
| 296 |
-
})
|
| 297 |
-
|
| 298 |
-
debug_info.append(f"\n📈 TOTAL: {len(docs)} chunks created")
|
| 299 |
-
|
| 300 |
-
if not docs:
|
| 301 |
-
return "❌ No valid content extracted.\n\n" + "\n".join(debug_info[-12:])
|
| 302 |
-
|
| 303 |
-
# Create FAISS index
|
| 304 |
-
try:
|
| 305 |
-
debug_info.append("🔄 Creating embeddings...")
|
| 306 |
-
embeddings = embed_model.encode(docs, show_progress_bar=False)
|
| 307 |
-
dimension = embeddings.shape[1]
|
| 308 |
-
index = faiss.IndexFlatL2(dimension)
|
| 309 |
-
index.add(embeddings.astype('float32'))
|
| 310 |
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
|
|
|
| 317 |
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
# ==============================
|
| 323 |
# RETRIEVAL & GENERATION
|
| 324 |
# ==============================
|
| 325 |
def retrieve_topk(query, k=3):
|
| 326 |
-
|
| 327 |
-
if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
|
| 328 |
-
return []
|
| 329 |
-
|
| 330 |
-
q_emb = embed_model.encode([query], show_progress_bar=False)
|
| 331 |
-
index = faiss.read_index(INDEX_PATH)
|
| 332 |
-
distances, indices = index.search(q_emb.astype('float32'), k)
|
| 333 |
-
|
| 334 |
-
with open(METADATA_PATH, 'r', encoding='utf-8') as f:
|
| 335 |
-
metadata = json.load(f)
|
| 336 |
-
|
| 337 |
-
return [metadata[i] for i in indices[0] if i < len(metadata)]
|
| 338 |
-
except:
|
| 339 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
def ask_prompt(query):
|
| 342 |
hits = retrieve_topk(query)
|
| 343 |
if not hits:
|
| 344 |
-
return "No documents
|
| 345 |
-
|
| 346 |
-
|
|
|
|
| 347 |
sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
|
| 348 |
-
|
| 349 |
-
full_prompt = f"""Context:
|
| 350 |
-
{context}
|
| 351 |
-
|
| 352 |
-
Question: {query}
|
| 353 |
-
|
| 354 |
-
Answer:"""
|
| 355 |
-
|
| 356 |
-
try:
|
| 357 |
-
result = gen_pipeline(full_prompt, max_length=300, do_sample=False)[0]['generated_text']
|
| 358 |
-
answer = result.split('Answer:')[-1].strip() if 'Answer:' in result else result
|
| 359 |
-
return f"{answer}\n\n**Sources:**\n" + "\n".join(sources)
|
| 360 |
-
except Exception as e:
|
| 361 |
-
return f"Generation error: {str(e)}"
|
| 362 |
|
| 363 |
# ==============================
|
| 364 |
-
#
|
| 365 |
# ==============================
|
| 366 |
-
with gr.Blocks(
|
| 367 |
-
gr.Markdown("# 🔍 Document
|
| 368 |
-
gr.Markdown("Upload PDFs, DOCX, Excel files or URLs to create a searchable knowledge base.")
|
| 369 |
-
|
| 370 |
with gr.Row():
|
| 371 |
-
with gr.Column(
|
| 372 |
-
file_input = gr.File(file_count="multiple"
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
ask_btn = gr.Button("💬 Get Answer", variant="secondary")
|
| 380 |
-
answer_output = gr.Textbox(label="Answer", lines=10)
|
| 381 |
-
|
| 382 |
-
# Events
|
| 383 |
-
ingest_btn.click(
|
| 384 |
-
ingest_sources,
|
| 385 |
-
inputs=[file_input, url_input],
|
| 386 |
-
outputs=status_output
|
| 387 |
-
)
|
| 388 |
-
ask_btn.click(
|
| 389 |
-
ask_prompt,
|
| 390 |
-
inputs=query_input,
|
| 391 |
-
outputs=answer_output
|
| 392 |
-
)
|
| 393 |
|
| 394 |
-
gr.
|
| 395 |
-
|
| 396 |
-
- **PDFs**: Works best with searchable PDFs (not scanned images)
|
| 397 |
-
- **Scanned PDFs**: Convert to searchable text first using Adobe Acrobat or online OCR
|
| 398 |
-
- **Large files**: Processing may take 1-2 minutes
|
| 399 |
-
- **Test first**: Try with a simple text file to verify setup
|
| 400 |
-
""")
|
| 401 |
|
| 402 |
if __name__ == "__main__":
|
| 403 |
demo.launch()
|
|
|
|
| 11 |
import numpy as np
|
| 12 |
from transformers import pipeline
|
| 13 |
import logging
|
| 14 |
+
import io
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# PDF libraries
|
| 17 |
try:
|
| 18 |
from pypdf import PdfReader
|
| 19 |
HAS_PYPDF = True
|
|
|
|
| 26 |
except:
|
| 27 |
HAS_PDFPLUMBER = False
|
| 28 |
|
|
|
|
|
|
|
| 29 |
logging.basicConfig(level=logging.INFO)
|
| 30 |
logger = logging.getLogger(__name__)
|
| 31 |
|
|
|
|
| 38 |
METADATA_PATH = "metadata.json"
|
| 39 |
|
| 40 |
# Initialize models
|
| 41 |
+
# Sentence-embedding model shared by ingestion and retrieval.
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Text-to-text generator used to answer questions from retrieved context.
gen_pipeline = pipeline(
    "text2text-generation",
    model=HF_GENERATION_MODEL,
    device=-1,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# ==============================
|
| 45 |
+
# SIMPLE TEXT SPLITTER
|
| 46 |
# ==============================
|
| 47 |
def simple_text_splitter(text, chunk_size=1000, chunk_overlap=100):
|
|
|
|
| 48 |
if len(text) <= chunk_size:
|
| 49 |
return [text.strip()]
|
| 50 |
|
| 51 |
chunks = []
|
| 52 |
start = 0
|
|
|
|
| 53 |
while start < len(text):
|
| 54 |
+
end = min(start + chunk_size, len(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
chunk = text[start:end].strip()
|
| 56 |
+
if len(chunk) > 50:
|
| 57 |
chunks.append(chunk)
|
|
|
|
| 58 |
start = end - chunk_overlap
|
| 59 |
+
return [c for c in chunks if len(c) > 20]
|
|
|
|
| 60 |
|
| 61 |
# ==============================
|
| 62 |
+
# CORRECTED FILE HANDLING FOR GRADIO
|
| 63 |
# ==============================
|
| 64 |
+
def get_file_data(file_obj):
    """Normalise an uploaded file object into ``(payload, kind)``.

    The object is probed in this order:

    1. a truthy ``.name`` attribute      -> ``(name, "path")``
    2. a truthy ``.data`` attribute      -> ``(data, "bytes")``
    3. a ``.read()`` method with content -> ``(content, "read")``
    4. a dict with truthy ``'data'``     -> ``(data, "dict")``
       or truthy ``'name'``              -> ``(name, "dict_path")``
    5. a string naming an existing path  -> ``(path, "string_path")``

    Returns:
        ``(payload, kind)`` on success. When nothing matches, returns
        ``(None, notes)`` where ``notes`` is a list of diagnostic strings;
        callers distinguish the two cases by the type of the second element.
    """
    notes = []

    path_attr = getattr(file_obj, 'name', None)
    if path_attr:
        return path_attr, "path"

    payload = getattr(file_obj, 'data', None)
    if payload:
        return payload, "bytes"

    if hasattr(file_obj, 'read'):
        try:
            file_obj.seek(0)  # Reset file pointer
        except (AttributeError, OSError, ValueError):
            # Non-seekable stream: still worth reading from wherever it is,
            # instead of giving up on the read altogether.
            pass
        try:
            content = file_obj.read()
        except Exception as e:
            notes.append(f"Read failed: {e}")
        else:
            if content:
                return content, "read"

    if isinstance(file_obj, dict):
        if file_obj.get('data'):
            return file_obj['data'], "dict"
        if file_obj.get('name'):
            return file_obj['name'], "dict_path"

    if isinstance(file_obj, str) and os.path.exists(file_obj):
        return file_obj, "string_path"

    notes.append("❌ No valid file data found")
    return None, notes
|
| 105 |
|
| 106 |
# ==============================
|
| 107 |
+
# PDF EXTRACTION
|
| 108 |
# ==============================
|
| 109 |
+
def extract_pdf_text(file_data, source_type, debug_info):
    """Extract text from a PDF, trying several extractors in turn.

    Order of attempts: the ``pdftotext`` command (path sources only), then
    pdfplumber (if ``HAS_PDFPLUMBER``), then pypdf (if ``HAS_PYPDF``).

    Args:
        file_data: A filesystem path (for path-like source types) or the raw
            PDF payload as ``bytes``/``str``.
        source_type: The kind tag produced by ``get_file_data``.
        debug_info: List that diagnostic messages are appended to.

    Returns:
        The extracted text, or ``""`` when nothing could be extracted. The
        reason for a failure is appended to ``debug_info`` rather than being
        returned, so an error message can never be mistaken for (and indexed
        as) document content.
    """
    import subprocess

    temp_path = None
    try:
        if source_type in ("path", "string_path", "dict_path"):
            file_path = file_data
            if not os.path.exists(file_path):
                debug_info.append(f"❌ File path doesn't exist: {file_path}")
                return ""

            # Try pdftotext first (if available)
            try:
                result = subprocess.run(
                    ['pdftotext', file_path, '-'],
                    capture_output=True, text=True, timeout=15,
                )
            except (OSError, ValueError, subprocess.SubprocessError) as e:
                # Missing binary, timeout or undecodable output: fall through
                # to the pure-Python extractors.
                debug_info.append(f"pdftotext failed: {e}")
            else:
                if result.returncode == 0 and len(result.stdout.strip()) > 30:
                    debug_info.append(f"✅ pdftotext: {len(result.stdout)} chars")
                    return result.stdout
        elif source_type in ("bytes", "read", "dict"):
            # Write the payload inside the context manager so the handle is
            # closed before the extractors reopen the file by name.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                temp_path = tmp.name
                if isinstance(file_data, str):
                    tmp.write(file_data.encode('latin1'))  # PDFs are binary
                else:
                    tmp.write(file_data)
            file_path = temp_path
            debug_info.append(f"Created temp file: {temp_path}")
        else:
            debug_info.append(f"❌ Unsupported source type: {source_type}")
            return ""

        # Try pdfplumber (only the first 5 pages are read)
        if HAS_PDFPLUMBER:
            try:
                with pdfplumber.open(file_path) as pdf:
                    page_texts = (page.extract_text() for page in pdf.pages[:5])
                    text = "".join(t + "\n" for t in page_texts if t)
                if len(text.strip()) > 50:
                    debug_info.append(f"✅ pdfplumber: {len(text)} chars")
                    return text
            except Exception as e:
                debug_info.append(f"pdfplumber failed: {e}")

        # Try pypdf (only the first 3 pages are read)
        if HAS_PYPDF:
            try:
                reader = PdfReader(file_path)
                parts = []
                for page in reader.pages[:3]:
                    try:
                        page_text = page.extract_text()
                    except Exception:
                        # Skip a page that cannot be parsed and keep going.
                        continue
                    if page_text and page_text.strip():
                        parts.append(page_text + "\n")
                text = "".join(parts)
                if len(text.strip()) > 30:
                    debug_info.append(f"✅ pypdf: {len(text)} chars")
                    return text
            except Exception as e:
                debug_info.append(f"pypdf failed: {e}")

        debug_info.append("No text extracted - likely scanned PDF images")
        return ""

    finally:
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError:
                # Best-effort cleanup; a stale temp file is not fatal.
                pass
|
| 184 |
|
| 185 |
+
# ==============================
|
| 186 |
+
# OTHER EXTRACTIONS
|
| 187 |
+
# ==============================
|
| 188 |
+
def extract_docx_text(file_data, source_type, debug_info):
    """Extract paragraph text from a DOCX document.

    Args:
        file_data: A filesystem path (for path-like source types) or the raw
            document payload as ``bytes``.
        source_type: The kind tag produced by ``get_file_data``.
        debug_info: List that diagnostic messages are appended to.

    Returns:
        Non-empty paragraphs joined by blank lines, or ``""`` when the
        document cannot be read or holds 20 characters of text or fewer.
        Failure reasons go to ``debug_info`` so an error message is never
        indexed as document content.
    """
    if source_type in ("path", "string_path", "dict_path"):
        # Every path-like tag carries a filesystem path, not just "path".
        source = file_data
    elif isinstance(file_data, (bytes, bytearray)):
        # An in-memory stream avoids a temp file that would otherwise be
        # left behind whenever parsing fails.
        source = io.BytesIO(file_data)
    else:
        debug_info.append(
            f"❌ DOCX: unsupported payload type {type(file_data).__name__}"
        )
        return ""

    try:
        doc = Document(source)
        text = "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        # Parsing an arbitrary upload can fail in many ways; record and skip.
        debug_info.append(f"DOCX error: {e}")
        return ""

    if len(text) > 20:
        return text
    debug_info.append("No text in DOCX")
    return ""
|
| 207 |
|
| 208 |
+
def extract_text_file(file_data, source_type, debug_info):
    """Return the contents of a plain-text upload.

    Args:
        file_data: A filesystem path (for path-like source types), raw
            ``bytes``, or an already-decoded string.
        source_type: The kind tag produced by ``get_file_data``.
        debug_info: List that diagnostic messages are appended to.

    Returns:
        The decoded text (UTF-8, undecodable bytes ignored), or ``""`` if the
        file cannot be read; the failure reason is appended to ``debug_info``.
    """
    if source_type in ("path", "string_path", "dict_path"):
        # All path-like tags must be opened; stringifying them would index
        # the path itself as if it were the document.
        try:
            with open(file_data, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        except (OSError, TypeError, ValueError) as e:
            debug_info.append(f"Text extraction failed: {e}")
            return ""

    if isinstance(file_data, (bytes, bytearray)):
        return bytes(file_data).decode('utf-8', errors='ignore')
    return str(file_data)
|
| 220 |
|
| 221 |
# ==============================
|
| 222 |
+
# MAIN INGESTION
|
| 223 |
# ==============================
|
| 224 |
def _source_name(file_obj, file_data, source_info, position):
    """Return the best available file name for one uploaded object."""
    if source_info in ("path", "string_path", "dict_path"):
        # For path-like sources get_file_data already returned the path.
        name = file_data
    elif isinstance(file_obj, dict):
        name = file_obj.get('name')
    else:
        name = getattr(file_obj, 'name', None)
    if isinstance(name, bytes):
        name = name.decode('utf-8', errors='ignore')
    return str(name) if name else f'file_{position}'


def ingest_sources(files, urls=""):
    """Rebuild the FAISS index and metadata file from the uploaded files.

    Any existing index/metadata files are removed first. Each upload is
    resolved with ``get_file_data``, dispatched on its extension (.pdf,
    .docx/.doc, .txt/.md), split with ``simple_text_splitter`` and embedded.

    Args:
        files: Iterable of uploaded file objects, or ``None``.
        urls: Accepted for interface compatibility; not used by this function.

    Returns:
        A success message with the chunk count, or a failure message followed
        by the last 15 diagnostic lines when no content was extracted.
    """
    docs = []
    metadata = []
    debug_info = []

    # Clear existing
    for path in (INDEX_PATH, METADATA_PATH):
        if os.path.exists(path):
            try:
                os.remove(path)
            except OSError as e:
                debug_info.append(f"⚠️ Could not remove {path}: {e}")

    extractors = {
        '.pdf': extract_pdf_text,
        '.docx': extract_docx_text,
        '.doc': extract_docx_text,
        '.txt': extract_text_file,
        '.md': extract_text_file,
    }

    # Process files
    for i, file_obj in enumerate(files or [], start=1):
        debug_info.append(f"\n📄 Processing file {i}")

        # A list in the second slot is get_file_data's failure signal.
        file_data, source_info = get_file_data(file_obj)
        if isinstance(source_info, list):
            debug_info.extend(source_info)
            continue

        if not file_data:
            debug_info.append("❌ No file data")
            continue

        # Derive the name from whatever actually carried it, so plain string
        # paths and dict uploads keep their extension.
        filename = _source_name(file_obj, file_data, source_info, i)
        ext = os.path.splitext(filename.lower())[1]
        # Show only the base name to users, not the server-side temp path.
        display_name = os.path.basename(filename) or filename

        debug_info.append(f"File: {filename}, Type: {source_info}")

        extractor = extractors.get(ext)
        if extractor is None:
            debug_info.append(f"Unknown extension: {ext}")
            continue
        text = extractor(file_data, source_info, debug_info)

        # Preview
        preview = text[:100].replace('\n', ' ').strip()
        if len(preview) > 80:
            preview = preview[:80] + "..."
        debug_info.append(f"Extracted {len(text)} chars")
        debug_info.append(f"Preview: '{preview}'")

        # Create chunks
        if len(text.strip()) > 30:
            chunks = simple_text_splitter(text)
            for j, chunk in enumerate(chunks):
                docs.append(chunk)
                metadata.append({
                    "source": display_name,
                    "chunk": j,
                    "text": chunk,
                })
            debug_info.append(f"✅ {len(chunks)} chunks created")
        else:
            debug_info.append("⚠️ Insufficient content")

    debug_info.append(f"\n📊 Total: {len(docs)} chunks")

    if not docs:
        return "❌ No content.\n\n" + "\n".join(debug_info[-15:])

    embeddings = embed_model.encode(docs).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, INDEX_PATH)
    with open(METADATA_PATH, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)
    return f"✅ SUCCESS: {len(docs)} chunks!"
|
| 301 |
|
| 302 |
# ==============================
|
| 303 |
# RETRIEVAL & GENERATION
|
| 304 |
# ==============================
|
| 305 |
def retrieve_topk(query, k=3):
    """Return the metadata records of the ``k`` chunks nearest to *query*.

    Args:
        query: The question text to embed and search for.
        k: Maximum number of records to return.

    Returns:
        A list of metadata dicts in the order FAISS returned them; empty when
        the index or metadata file is missing or ``k`` is not positive.
    """
    if k <= 0:
        return []
    if not (os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH)):
        return []

    q_emb = embed_model.encode([query])
    index = faiss.read_index(INDEX_PATH)
    _, neighbours = index.search(q_emb.astype('float32'), k)

    with open(METADATA_PATH, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # FAISS fills unused result slots with -1 when the index holds fewer than
    # k vectors; without the lower bound that would select metadata[-1].
    return [metadata[i] for i in neighbours[0] if 0 <= i < len(metadata)]
|
| 314 |
|
| 315 |
def ask_prompt(query):
    """Answer *query* from the ingested documents and list the chunks used.

    Returns the generated text followed by a "Sources:" list, or a fixed
    message when retrieval yields nothing.
    """
    hits = retrieve_topk(query)
    if not hits:
        return "No documents found."

    # Each retrieved chunk contributes at most its first 600 characters.
    excerpts = (hit['text'][:600] for hit in hits)
    context = "\n\n".join(excerpts)
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"

    generations = gen_pipeline(prompt, max_length=300)
    result = generations[0]['generated_text']

    sources = []
    for hit in hits:
        sources.append(f"{hit['source']} (chunk {hit['chunk']})")
    citation_block = "\n".join(sources)
    return f"{result}\n\nSources:\n" + citation_block
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
# ==============================
|
| 326 |
+
# UI
|
| 327 |
# ==============================
|
| 328 |
+
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Document QA")

    with gr.Row():
        # Left column: upload and ingestion status.
        with gr.Column():
            file_input = gr.File(file_count="multiple")
            ingest_btn = gr.Button("Ingest", variant="primary")
            status = gr.Textbox(lines=15)
        # Right column: question and answer.
        with gr.Column():
            query_input = gr.Textbox(label="Question")
            ask_btn = gr.Button("Ask")
            answer = gr.Textbox(lines=10)

    # ingest_sources takes a second `urls` argument; a constant empty-string
    # State supplies it because the UI has no URL field.
    empty_urls = gr.State("")
    ingest_btn.click(
        fn=ingest_sources,
        inputs=[file_input, empty_urls],
        outputs=status,
    )
    ask_btn.click(fn=ask_prompt, inputs=query_input, outputs=answer)

if __name__ == "__main__":
    demo.launch()
|