Update app.py

app.py CHANGED
@@ -5,17 +5,32 @@ import json
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
-from pypdf import PdfReader
 from docx import Document
 from sentence_transformers import SentenceTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import faiss
 import numpy as np
 from transformers import pipeline
-import traceback
 import logging
+import subprocess
+import shutil
+
+# Try importing PDF libraries with fallbacks
+try:
+    from pypdf import PdfReader
+    HAS_PYPDF = True
+except:
+    HAS_PYPDF = False
+
+try:
+    import pdfplumber
+    HAS_PDFPLUMBER = True
+except:
+    HAS_PDFPLUMBER = False
+
+# Fallback: use pdftotext if available (common on Linux systems)
+HAS_PDFTOTEXT = shutil.which('pdftotext') is not None

-# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -27,422 +42,321 @@ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
 INDEX_PATH = "faiss_index.index"
 METADATA_PATH = "metadata.json"

-
-
-
-gen_pipeline = None
+embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)

-
-
+# ==============================
+# ROBUST PDF EXTRACTION WITH FALLBACKS
+# ==============================
+def extract_text_from_pdf_robust(file_path):
+    """Try multiple PDF extraction methods in order of reliability"""
+    methods_tried = []
+    text = ""
+
+    debug_info = []
+
+    # Method 1: pdftotext (most reliable, system command)
+    if HAS_PDFTOTEXT:
+        try:
+            debug_info.append("Trying pdftotext...")
+            result = subprocess.run(
+                ['pdftotext', '-layout', file_path, '-'],
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+            if result.returncode == 0 and result.stdout.strip():
+                text = result.stdout
+                debug_info.append(f"✅ pdftotext success: {len(text)} chars")
+                return text, debug_info
+        except Exception as e:
+            debug_info.append(f"pdftotext failed: {e}")
+
+    # Method 2: pdfplumber (good for complex layouts)
+    if HAS_PDFPLUMBER:
+        try:
+            debug_info.append("Trying pdfplumber...")
+            with pdfplumber.open(file_path) as pdf:
+                for i, page in enumerate(pdf.pages[:10]):  # Limit pages
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += f"\n--- Page {i+1} ---\n{page_text}"
+            if len(text.strip()) > 50:
+                debug_info.append(f"✅ pdfplumber success: {len(text)} chars")
+                return text, debug_info
+        except Exception as e:
+            debug_info.append(f"pdfplumber failed: {e}")
+
+    # Method 3: pypdf with error handling
+    if HAS_PYPDF:
+        try:
+            debug_info.append("Trying pypdf...")
+            reader = PdfReader(file_path)
+            page_count = len(reader.pages)
+
+            for i, page in enumerate(reader.pages[:5]):  # First 5 pages only
+                try:
+                    # Try different extraction methods
+                    if hasattr(page, 'extract_text'):
+                        page_text = page.extract_text()
+                    elif hasattr(page, 'extractText'):
+                        page_text = page.extractText()
+                    else:
+                        continue
+
+                    if page_text and page_text.strip():
+                        text += f"\n--- Page {i+1}/{page_count} ---\n{page_text}\n"
+                except Exception as page_e:
+                    debug_info.append(f"Page {i+1} failed: {page_e}")
+                    continue
+
+            if len(text.strip()) > 50:
+                debug_info.append(f"✅ pypdf success: {len(text)} chars")
+                return text, debug_info
+            else:
+                debug_info.append(f"pypdf extracted only {len(text)} chars")
+        except Exception as e:
+            debug_info.append(f"pypdf failed: {e}")
+
+    # Method 4: Check if it's just an image/scanned PDF
     try:
-
-
-
-
-
-
-
-
+        file_size = os.path.getsize(file_path)
+        with open(file_path, 'rb') as f:
+            header = f.read(1024)
+            if b'%PDF' not in header:
+                return "Invalid PDF format", debug_info
+            if b'/Encrypt' in header:
+                return "PDF is password protected", debug_info
+    except:
+        pass
+
+    debug_info.append("❌ All PDF methods failed")
+    return f"No text extracted. Tried: {', '.join(methods_tried)}. Likely scanned images.", debug_info

-
-
-
+def extract_text_from_pdf_simple(file_path):
+    """Simplified fallback - just try to get ANY text"""
+    all_text = ""
+
+    # Try pdftotext first (most reliable)
+    if HAS_PDFTOTEXT:
+        try:
+            result = subprocess.run(
+                ['pdftotext', file_path, '-'],
+                capture_output=True, text=True, timeout=10
+            )
+            if result.returncode == 0:
+                all_text = result.stdout
+                if len(all_text.strip()) > 20:
+                    return all_text
+        except:
+            pass
+
+    # Try pdfplumber
+    if HAS_PDFPLUMBER:
+        try:
+            import pdfplumber
+            with pdfplumber.open(file_path) as pdf:
+                for page in pdf.pages[:3]:
+                    text = page.extract_text()
+                    if text:
+                        all_text += text + "\n"
+            if len(all_text.strip()) > 20:
+                return all_text
+        except:
+            pass
+
+    # Last resort: pypdf with minimal error handling
+    if HAS_PYPDF:
+        try:
+            reader = PdfReader(file_path)
+            for page in reader.pages[:2]:
+                try:
+                    text = page.extract_text()
+                    if text and len(text.strip()) > 10:
+                        all_text += text + "\n"
+                except:
+                    continue
+            return all_text
+        except:
+            pass
+
+    return "PDF extraction completely failed - likely scanned images with no text layer"

 # ==============================
-#
+# OTHER EXTRACTION FUNCTIONS
 # ==============================
-def extract_text_from_pdf(file_path):
-    """Extract text from PDF using pypdf"""
-    try:
-        logger.info(f"Extracting PDF from: {file_path}")
-        reader = PdfReader(file_path)
-        text = ""
-        page_count = len(reader.pages)
-
-        # Extract from first few pages only for speed
-        for i, page in enumerate(reader.pages[:5]):
-            try:
-                page_text = page.extract_text()
-                if page_text and page_text.strip():
-                    text += f"\n--- Page {i+1}/{page_count} ---\n{page_text}\n"
-            except Exception as e:
-                logger.warning(f"Failed to extract page {i+1}: {e}")
-                continue
-
-        logger.info(f"PDF extraction complete: {len(text)} characters")
-        return text.strip()
-
-    except Exception as e:
-        logger.error(f"PDF extraction error: {e}")
-        return f"PDF extraction failed: {str(e)}"
-
 def extract_text_from_docx(file_path):
-    """Extract text from DOCX"""
     try:
         doc = Document(file_path)
-
-
-
-        return text
-    except Exception as e:
-        logger.error(f"DOCX extraction error: {e}")
-        return f"DOCX error: {str(e)}"
+        return "\n\n".join([p.text for p in doc.paragraphs if p.text.strip()])
+    except:
+        return "DOCX extraction failed"

 def extract_text_from_excel(file_path):
-    """Extract text from Excel (first sheet preview)"""
     try:
-        df = pd.read_excel(file_path,
-
-
-        return
-    except Exception as e:
-        logger.error(f"Excel extraction error: {e}")
-        return f"Excel error: {str(e)}"
+        df = pd.read_excel(file_path, nrows=30)
+        return df.to_string()
+    except:
+        return "Excel extraction failed"

 def extract_text_from_txt(file_path):
-
-
-
+    try:
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            return f.read()
+    except:
         try:
-            with open(file_path, 'r', encoding='utf-8') as f:
+            with open(file_path, 'r', encoding='latin-1') as f:
                 return f.read()
         except:
-
-            return "Could not read text file with available encodings"
+            return "Text extraction failed"

 def extract_text_from_url(url):
-    """Extract text from URL"""
     try:
-
-        r = requests.get(url, timeout=15, headers=headers)
-        r.raise_for_status()
-
+        r = requests.get(url, timeout=10)
         soup = BeautifulSoup(r.text, 'html.parser')
-
-
-
-        text = soup.get_text(separator='\n', strip=True)
-        # Clean up excessive whitespace
-        lines = [line.strip() for line in text.split('\n') if line.strip()]
-        return '\n'.join(lines[:100])  # Limit lines
-    except Exception as e:
-        logger.error(f"URL extraction error: {e}")
-        return f"URL error: {str(e)}"
+        return soup.get_text(separator='\n', strip=True)[:2000]
+    except:
+        return "URL extraction failed"

 # ==============================
-# MAIN INGESTION
+# MAIN INGESTION
 # ==============================
 def ingest_sources(files, urls):
-    """Process files and URLs, create embeddings index"""
     docs = []
     metadata = []
     debug_info = []

-    # Clear existing index
+    # Clear existing index
     for path in [INDEX_PATH, METADATA_PATH]:
         if os.path.exists(path):
             os.remove(path)
-            debug_info.append(f"🗑️ Cleared existing {os.path.basename(path)}")

-
-    processed_files = 0
+    processed = 0
     for f in files or []:
-
-
-
-
-
-
-
-
-
-        # Create temp file
-        suffix = os.path.splitext(name)[1] or '.txt'
-        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir='/tmp') as tmp:
-            # Handle different Gradio file formats
-            file_data = None
+        processed += 1
+        name = getattr(f, 'name', f'file_{processed}')
+        debug_info.append(f"\n📄 Processing: {os.path.basename(name)}")
+
+        # Save to temp file
+        suffix = os.path.splitext(name)[1] or '.pdf'
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            try:
                 if hasattr(f, 'read'):
-
-
-
-            elif isinstance(f, dict) and 'data' in f:
-                file_data = f['data']
-                if isinstance(file_data, str):
-                    file_data = file_data.encode('utf-8')
-            elif isinstance(f, str):
-                file_data = f.encode('utf-8')
-
-            if not file_data:
-                debug_info.append("❌ No file data available")
-                continue
+                    data = f.read()
+                else:
+                    data = f if isinstance(f, bytes) else str(f).encode()

-            tmp.write(file_data)
+                tmp.write(data)
                 tmp_path = tmp.name
-            tmp.flush()
-
-        # Extract text based on extension
-        ext = os.path.splitext(name.lower())[1]
-        text = ""
-
-        if ext == '.pdf':
-            text = extract_text_from_pdf(tmp_path)
-        elif ext in ['.doc', '.docx']:
-            text = extract_text_from_docx(tmp_path)
-        elif ext in ['.xls', '.xlsx', '.csv']:
-            text = extract_text_from_excel(tmp_path)
-        else:
-            text = extract_text_from_txt(tmp_path)
-
-        # Show preview of extracted content
-        preview = text[:200].replace('\n', ' ').strip()
-        if len(preview) > 100:
-            preview = preview[:100] + "..."
-        debug_info.append(f"📄 Extracted {len(text)} chars")
-        debug_info.append(f"🔍 Preview: '{preview}'")
-
-        # Create chunks if we have substantial content
-        if len(text.strip()) > 30 and not text.startswith(('error', 'PDF extraction failed')):
-            chunks = splitter.split_text(text)
-            valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
-
-
-
-                metadata.append({
-                    "source": os.path.basename(name),
-                    "chunk_id": i,
-                    "type": "file",
-                    "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
-                })
-
-
-
-
-
-
-
-
-
-        pass
-
-
-
-
-
-
-
-    debug_info.append(f"\n🌐 Processing URLs:")
-    for url_line in urls.strip().split('\n'):
-        url = url_line.strip()
-        if url.startswith('http'):
-            debug_info.append(f"  📡 {url}")
-            text = extract_text_from_url(url)
-
+                ext = os.path.splitext(name.lower())[1]
+                text = ""
+
+                if ext == '.pdf':
+                    text = extract_text_from_pdf_simple(tmp_path)
+                elif ext == '.docx':
+                    text = extract_text_from_docx(tmp_path)
+                elif ext in ['.xls', '.xlsx']:
+                    text = extract_text_from_excel(tmp_path)
+                else:
+                    text = extract_text_from_txt(tmp_path)
+
+                preview = text[:150].replace('\n', ' ').strip()
+                if len(preview) > 100:
+                    preview = preview[:100] + "..."
+
+                debug_info.append(f"Extracted {len(text)} chars")
+                debug_info.append(f"Preview: '{preview}'")
+
+                # Accept ANY substantial text
+                if len(text.strip()) > 25:
                     chunks = splitter.split_text(text)
-            valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
+                    valid_chunks = [c for c in chunks if len(c.strip()) > 15]

                     for i, chunk in enumerate(valid_chunks):
                         docs.append(chunk)
                         metadata.append({
-                    "source": url,
-                    "chunk_id": i,
-                    "type": "url",
-                    "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
+                            "source": os.path.basename(name),
+                            "chunk": i,
+                            "text": chunk[:900]  # Limit stored text
                         })

-            debug_info.append(f"
+                    debug_info.append(f"✅ Created {len(valid_chunks)} chunks")
                 else:
-            debug_info.append(
+                    debug_info.append("⚠️ Too little content")
+
+            finally:
+                try:
+                    os.unlink(tmp.name)
+                except:
+                    pass

-    debug_info.append(f"\n📊
+    debug_info.append(f"\n📊 Total chunks: {len(docs)}")

-    if
-
+    if docs:
+        try:
+            embeddings = embed_model.encode(docs)
+            index = faiss.IndexFlatL2(embeddings.shape[1])
+            index.add(embeddings)
+            faiss.write_index(index, INDEX_PATH)
+
+            with open(METADATA_PATH, 'w') as f:
+                json.dump(metadata, f)
+
+            return f"✅ SUCCESS: {len(docs)} chunks indexed!"
+        except Exception as e:
+            return f"❌ Indexing failed: {e}"

-
-    try:
-        debug_info.append("🔄 Creating embeddings and index...")
-        embeddings = embed_model.encode(docs, show_progress_bar=False, convert_to_numpy=True)
-        dimension = embeddings.shape[1]
-        index = faiss.IndexFlatL2(dimension)
-        index.add(embeddings.astype('float32'))
-
-        # Save index and metadata
-        faiss.write_index(index, INDEX_PATH)
-        with open(METADATA_PATH, 'w', encoding='utf-8') as f:
-            json.dump(metadata, f, ensure_ascii=False, indent=2)
-
-        debug_info.append(f"✅ Index created successfully: {embeddings.shape[0]} vectors")
-        return f"🎉 SUCCESS! Ingested {len(docs)} chunks from {processed_files} files.\n\n" + "\n".join(debug_info[-8:])
-
-    except Exception as e:
-        debug_info.append(f"💥 Index creation failed: {str(e)}")
-        logger.error(f"Index creation error: {e}", exc_info=True)
-        return f"❌ Indexing failed: {str(e)}\n\n" + "\n".join(debug_info[-10:])
+    return "❌ No valid content.\n\n" + "\n".join(debug_info)

 # ==============================
-# RETRIEVAL
+# RETRIEVAL & GENERATION (unchanged)
 # ==============================
-def retrieve_topk(query, k=
-
-    try:
-        if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
-            return []
-
-        query_embedding = embed_model.encode([query], convert_to_numpy=True)
-        index = faiss.read_index(INDEX_PATH)
-        distances, indices = index.search(query_embedding.astype('float32'), k)
-
-        with open(METADATA_PATH, 'r', encoding='utf-8') as f:
-            metadata = json.load(f)
-
-        results = []
-        for i, idx in enumerate(indices[0]):
-            if idx < len(metadata):
-                results.append({
-                    **metadata[idx],
-                    "distance": float(distances[0][i])
-                })
-
-        return results[:k]
-    except Exception as e:
-        logger.error(f"Retrieval error: {e}")
+def retrieve_topk(query, k=3):
+    if not os.path.exists(INDEX_PATH):
         return []
+    q_emb = embed_model.encode([query])
+    index = faiss.read_index(INDEX_PATH)
+    D, I = index.search(q_emb, k)
+
+    with open(METADATA_PATH, 'r') as f:
+        metadata = json.load(f)
+
+    return [metadata[i] for i in I[0] if i < len(metadata)]

-# ==============================
-# GENERATION
-# ==============================
 def ask_prompt(query):
-
-
-
-
-
-
-
-
-
-
-
-            if len(content) > 50:
-                context_parts.append(content)
-                source_info = f"{hit['source']} (chunk {hit['chunk_id']})"
-                if hit.get('distance'):
-                    source_info += f" [relevance: {hit['distance']:.3f}]"
-                sources.append(source_info)
-
-        if not context_parts:
-            return "Retrieved documents but no content available."
-
-        context = "\n\n".join(context_parts)
-        full_prompt = f"""Based on the following context, answer the question.
-
-Context:
-{context}
-
-Question: {query}
-
-Answer:"""
-
-        # Generate response
-        result = gen_pipeline(
-            full_prompt,
-            max_length=400,
-            min_length=50,
-            do_sample=False,
-            temperature=0.1
-        )[0]['generated_text']
-
-        # Extract just the answer part
-        if "Answer:" in result:
-            answer = result.split("Answer:", 1)[1].strip()
-        else:
-            answer = result
-
-        response = f"{answer}\n\n**Sources:**\n" + "\n".join(sources)
-        return response
-
-    except Exception as e:
-        logger.error(f"Generation error: {e}")
-        return f"Error generating response: {str(e)}"
+    hits = retrieve_topk(query)
+    if not hits:
+        return "No documents ingested."
+
+    context = "\n\n".join([h['text'] for h in hits])
+    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
+
+    result = gen_pipeline(prompt, max_length=300)[0]['generated_text']
+    sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
+
+    return f"{result}\n\nSources:\n" + "\n".join(sources)

 # ==============================
 # GRADIO UI
 # ==============================
-
-
-
-
-
-
-
-
-        with gr.Column(scale=1):
-            gr.Markdown("### 📤 Document Ingestion")
-            file_input = gr.File(
-                label="Upload Files",
-                file_count="multiple",
-                file_types=[".pdf", ".docx", ".doc", ".txt", ".xlsx", ".xls", ".csv"]
-            )
-            url_input = gr.Textbox(
-                label="Or paste URLs (one per line)",
-                placeholder="https://example.com/document\nhttps://another-site.com/page",
-                lines=3
-            )
-            ingest_button = gr.Button("🚀 Ingest Documents", variant="primary", size="lg")
-            status_output = gr.Textbox(
-                label="Ingestion Status",
-                lines=12,
-                interactive=False
-            )
-
-        with gr.Column(scale=1):
-            gr.Markdown("### ❓ Ask Questions")
-            query_input = gr.Textbox(
-                label="Your Question",
-                placeholder="What does the document say about...",
-                lines=3
-            )
-            ask_button = gr.Button("💬 Get Answer", variant="secondary")
-            answer_output = gr.Textbox(
-                label="Answer",
-                lines=12,
-                interactive=False
-            )
-
-    # Event handlers
-    ingest_button.click(
-        ingest_sources,
-        inputs=[file_input, url_input],
-        outputs=status_output
-    )
-
-    ask_button.click(
-        ask_prompt,
-        inputs=query_input,
-        outputs=answer_output
-    )
+with gr.Blocks() as demo:
+    gr.Markdown("# 🔍 Document QA - Fixed PDF Extraction")
+
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(file_count="multiple")
+            ingest_btn = gr.Button("Ingest", variant="primary")
+            status = gr.Textbox(label="Status", lines=15)

-
-
-
-
-            ["Summarize the key points."],
-            ["What are the dates mentioned?"],
-        ],
-        inputs=query_input
-    )
+        with gr.Column():
+            query_input = gr.Textbox(label="Question")
+            ask_btn = gr.Button("Ask")
+            answer = gr.Textbox(label="Answer", lines=10)

-
+    ingest_btn.click(ingest_sources, [file_input, gr.State("")], status)
+    ask_btn.click(ask_prompt, query_input, answer)

-# ==============================
-# MAIN
-# ==============================
 if __name__ == "__main__":
-    demo
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,  # Set to True for public sharing
-        debug=True
-    )
+    demo.launch()
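A quick way to exercise the new fallback chain outside the Space is to call the extraction helpers directly. The sketch below is illustrative rather than part of the commit: it assumes the module above is saved as app.py with its dependencies installed, and sample.pdf is a hypothetical local file (importing app also loads the embedding model and generation pipeline at module level, so the first run is slow).

# check_pdf_fallbacks.py - hypothetical smoke test for the fallback chain in app.py
from app import extract_text_from_pdf_robust, extract_text_from_pdf_simple

# The robust variant returns (text, debug_info), so it reports which backend ran
text, debug_info = extract_text_from_pdf_robust("sample.pdf")
print("\n".join(debug_info))
print(f"robust chain returned {len(text)} chars")

# The simple variant, which ingest_sources actually calls for .pdf files, returns a bare string
simple_text = extract_text_from_pdf_simple("sample.pdf")
print(f"simple chain returned {len(simple_text)} chars")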
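The ingest-then-ask path can be driven the same way without the UI. Again a sketch under assumptions, not committed code: notes.txt is a hypothetical plain-text file, and the empty string stands in for the urls argument, which the rewritten ingest_sources no longer reads (the UI now passes gr.State("") for it).

# qa_roundtrip.py - hypothetical end-to-end check of ingest_sources and ask_prompt
from app import ingest_sources, ask_prompt

# A plain file object works: ingest_sources uses getattr(f, 'name', ...) and f.read()
with open("notes.txt", "rb") as f:
    print(ingest_sources([f], ""))

# ask_prompt embeds the query, searches faiss_index.index, and prompts the text2text pipeline
print(ask_prompt("Summarize the key points."))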
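Two small follow-ups are worth flagging in the new code. extract_text_from_pdf_robust initializes methods_tried but never appends to it, so its final failure message always interpolates an empty list; and ingest_sources runs the extractors against tmp_path while the NamedTemporaryFile is still open, where the old code called tmp.flush() after writing to make sure the bytes were on disk. A sketch of both fixes, offered as a suggestion rather than as the author's intent:

# Suggested tweaks (not in this commit):
# 1) in extract_text_from_pdf_robust, record each attempted backend, e.g.
#        methods_tried.append('pdftotext')   # and 'pdfplumber', 'pypdf' in their branches
# 2) in ingest_sources, flush after writing the upload:
#        tmp.write(data)
#        tmp.flush()  # ensure data is on disk before the extractors reopen tmp_path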