Morinash committed on
Commit
d6c6abe
Β·
verified Β·
1 Parent(s): 179339d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -141
app.py CHANGED
@@ -14,6 +14,19 @@ import numpy as np
14
  from transformers import pipeline
15
  import traceback
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # ==============================
18
  # CONFIG
19
  # ==============================
@@ -22,258 +35,264 @@ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
22
  INDEX_PATH = "faiss_index.index"
23
  METADATA_PATH = "metadata.json"
24
 
25
- # Load embedding model
26
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
27
 
28
  # ==============================
29
- # Enhanced text extractors with debugging
30
  # ==============================
31
- def extract_text_from_pdf(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  try:
33
- print(f"Processing PDF: {file_path}")
34
  reader = PdfReader(file_path)
35
- pages = []
36
  for i, page in enumerate(reader.pages):
37
- text = page.extract_text()
38
- if text and text.strip():
39
- pages.append(f"Page {i+1}:\n{text}")
40
- else:
41
- print(f"Warning: Page {i+1} has no extractable text")
42
- result = "\n\n".join(pages)
43
- print(f"PDF extracted {len(pages)} pages, {len(result)} chars")
44
- return result if result.strip() else "No text found in PDF (possibly scanned image)"
45
  except Exception as e:
46
- print(f"PDF error: {str(e)}")
47
- return f"PDF error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
49
  def extract_text_from_docx(file_path):
50
  try:
51
- print(f"Processing DOCX: {file_path}")
52
  doc = Document(file_path)
53
- paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
54
- result = "\n\n".join(paragraphs)
55
- print(f"DOCX extracted {len(paragraphs)} paragraphs, {len(result)} chars")
56
- return result if result.strip() else "No text found in DOCX"
57
  except Exception as e:
58
- print(f"DOCX error: {str(e)}")
59
  return f"DOCX error: {str(e)}"
60
 
61
  def extract_text_from_excel(file_path):
62
  try:
63
- print(f"Processing Excel: {file_path}")
64
- # Try first sheet only for speed
65
- df = pd.read_excel(file_path, sheet_name=0)
66
- result = f"Sheet: {df.shape}\n{df.fillna('').to_string()}"
67
- print(f"Excel extracted {df.shape[0]} rows, {len(result)} chars")
68
- return result
69
  except Exception as e:
70
- print(f"Excel error: {str(e)}")
71
  return f"Excel error: {str(e)}"
72
 
73
  def extract_text_from_txt(file_path):
74
  try:
75
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
76
  return f.read()
77
- except Exception as e:
78
- return f"TXT error: {str(e)}"
 
79
 
80
  def extract_text_from_url(url):
81
  try:
82
  r = requests.get(url, timeout=10)
83
- soup = BeautifulSoup(r.text, "html.parser") # Use html.parser as fallback
84
  for s in soup(["script", "style"]):
85
  s.decompose()
86
  text = soup.get_text(separator="\n", strip=True)
87
- return text[:5000] # Limit length
88
  except Exception as e:
89
  return f"URL error: {str(e)}"
90
 
91
  # ==============================
92
- # Text chunking
93
  # ==============================
94
  splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
95
 
96
  # ==============================
97
- # Enhanced ingestion with debug output
98
  # ==============================
99
  def ingest_sources(files, urls):
100
  docs = []
101
  metadata = []
102
  debug_info = []
103
-
104
- # Clear existing index for fresh start during testing
105
- if os.path.exists(INDEX_PATH):
106
- os.remove(INDEX_PATH)
107
- debug_info.append("Cleared existing index")
108
- if os.path.exists(METADATA_PATH):
109
- os.remove(METADATA_PATH)
110
- debug_info.append("Cleared existing metadata")
111
-
112
- # Process files
113
- processed_files = 0
114
  for f in files or []:
115
- processed_files += 1
116
- name = getattr(f, "name", f"file_{processed_files}")
117
- debug_info.append(f"Processing: {name}")
118
 
119
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
120
  try:
121
- # Handle different file types from Gradio
122
  data = None
123
  if hasattr(f, 'read'):
124
  data = f.read()
 
125
  elif isinstance(f, str):
126
  data = f.encode('utf-8')
127
  elif isinstance(f, dict) and 'data' in f:
128
  data = f['data']
129
- if isinstance(data, str):
130
- data = data.encode('utf-8')
131
 
132
- if data is None:
133
- debug_info.append(f"Failed: Could not read {name}")
134
  continue
135
 
136
  tmp.write(data)
137
  tmp.flush()
138
 
139
- # Extract based on extension
140
  ext = os.path.splitext(name.lower())[1]
141
  if ext == '.pdf':
142
- text = extract_text_from_pdf(tmp.name)
143
  elif ext == '.docx':
144
  text = extract_text_from_docx(tmp.name)
145
  elif ext in ['.xls', '.xlsx']:
146
  text = extract_text_from_excel(tmp.name)
147
- elif ext in ['.txt', '.md']:
148
- text = extract_text_from_txt(tmp.name)
149
  else:
150
  text = extract_text_from_txt(tmp.name)
151
-
152
- debug_info.append(f"Extracted {len(text)} chars from {name}")
153
 
154
- if len(text) > 50 and "error" not in text.lower():
 
 
 
155
  chunks = splitter.split_text(text)
156
- for i, chunk in enumerate(chunks):
157
- if len(chunk.strip()) > 10:
158
- docs.append(chunk)
159
- metadata.append({
160
- "source": name,
161
- "chunk": i,
162
- "type": "file",
163
- "text": chunk
164
- })
165
- debug_info.append(f"Created {len(chunks)} chunks from {name}")
166
  else:
167
- debug_info.append(f"Skipped {name}: too short or error")
168
 
169
  except Exception as e:
170
- debug_info.append(f"Error processing {name}: {str(e)}")
171
  finally:
172
- try:
173
- os.unlink(tmp.name)
174
- except:
175
- pass
176
-
177
- # Process URLs
178
- processed_urls = 0
179
  for url in (urls or "").splitlines():
180
- url = url.strip()
181
- if url:
182
- processed_urls += 1
183
- debug_info.append(f"Fetching URL: {url}")
184
- text = extract_text_from_url(url)
185
  if len(text) > 100 and "error" not in text.lower():
186
  chunks = splitter.split_text(text)
187
- for i, chunk in enumerate(chunks):
188
- if len(chunk.strip()) > 10:
189
- docs.append(chunk)
190
- metadata.append({
191
- "source": url,
192
- "chunk": i,
193
- "type": "url",
194
- "text": chunk
195
- })
196
- debug_info.append(f"Created {len(chunks)} chunks from {url}")
197
- else:
198
- debug_info.append(f"Skipped URL {url}: insufficient content")
199
-
200
- debug_info.append(f"Total chunks created: {len(docs)}")
201
 
202
  if not docs:
203
- return "❌ No valid text extracted.\n\nDebug info:\n" + "\n".join(debug_info[:10])
204
 
205
- # Create embeddings and index
206
  try:
207
- print(f"Creating embeddings for {len(docs)} chunks...")
208
- embeddings = embed_model.encode(docs, show_progress_bar=False, convert_to_numpy=True)
209
- dim = embeddings.shape[1]
210
- index = faiss.IndexFlatL2(dim)
211
  index.add(embeddings)
212
-
213
  faiss.write_index(index, INDEX_PATH)
214
  with open(METADATA_PATH, "w", encoding="utf-8") as f:
215
- json.dump(metadata, f, ensure_ascii=False, indent=2)
216
-
217
- return f"βœ… Success! Ingested {len(docs)} chunks.\n\nDebug: {len(files or [])} files, {processed_urls} URLs processed."
218
  except Exception as e:
219
- return f"❌ Indexing failed: {str(e)}\n\nDebug info:\n" + "\n".join(debug_info)
220
 
221
  # ==============================
222
- # Retrieval
223
  # ==============================
224
- def retrieve_topk(query, k=5):
225
- if not os.path.exists(INDEX_PATH):
226
- return []
227
  q_emb = embed_model.encode([query])
228
  index = faiss.read_index(INDEX_PATH)
229
  D, I = index.search(q_emb, k)
230
-
231
- with open(METADATA_PATH, "r", encoding="utf-8") as f:
232
  metadata = json.load(f)
233
-
234
  return [metadata[idx] for idx in I[0] if idx < len(metadata)]
235
 
236
- # ==============================
237
- # Generation
238
- # ==============================
239
  gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
240
 
241
  def ask_prompt(prompt):
242
- hits = retrieve_topk(prompt, k=3)
243
- if not hits:
244
- return "No documents ingested or no relevant matches found."
245
 
246
- context = "\n\n".join([h.get("text", "")[:1000] for h in hits]) # Limit context length
247
  sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
248
 
249
- full_prompt = f"Context:\n{context}\n\nQuestion: {prompt}\n\nAnswer:"
250
-
251
- try:
252
- result = gen_pipeline(full_prompt, max_length=300, do_sample=False)[0]["generated_text"]
253
- return f"{result}\n\n**Sources:**\n" + "\n".join(sources)
254
- except Exception as e:
255
- return f"Generation error: {str(e)}"
256
 
257
  # ==============================
258
- # Gradio UI
259
  # ==============================
260
  with gr.Blocks() as demo:
261
- gr.Markdown("# πŸ” Research Assistant\nUpload files and click **Ingest** to see debug info.")
262
-
263
  with gr.Row():
264
  with gr.Column():
265
- file_in = gr.File(label="Upload files", file_count="multiple")
266
- urls_in = gr.Textbox(label="URLs (one per line)", placeholder="https://example.com")
267
  ingest_btn = gr.Button("Ingest", variant="primary")
268
- ingest_output = gr.Textbox(label="Status & Debug Info", lines=8)
269
-
270
  with gr.Column():
271
- prompt_in = gr.Textbox(label="Ask a question", lines=3)
272
  ask_btn = gr.Button("Ask")
273
- answer_out = gr.Textbox(label="Answer", lines=10)
274
 
275
- ingest_btn.click(ingest_sources, inputs=[file_in, urls_in], outputs=ingest_output)
276
- ask_btn.click(ask_prompt, inputs=prompt_in, outputs=answer_out)
277
 
278
  if __name__ == "__main__":
279
  demo.launch()
 
14
  from transformers import pipeline
15
  import traceback
16
 
17
# Try multiple PDF libraries.
# Optional backends: pdfplumber is preferred, PyPDF2 is a fallback.
# Boolean flags record availability so extraction can try each in turn
# without hard-failing when a library is not installed.
try:
    from PyPDF2 import PdfReader as PyPDF2Reader
    HAS_PYPDF2 = True
except ImportError:
    HAS_PYPDF2 = False

try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False
29
+
30
  # ==============================
31
  # CONFIG
32
  # ==============================
 
35
  INDEX_PATH = "faiss_index.index"
36
  METADATA_PATH = "metadata.json"
37
 
 
38
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
39
 
40
  # ==============================
41
+ # ROBUST PDF EXTRACTION
42
  # ==============================
43
def extract_text_from_pdf_robust(file_path):
    """Try multiple PDF extraction methods in order of quality.

    Attempts pdfplumber, then pypdf, then PyPDF2; returns the first result
    with a meaningful amount of text. On total failure, inspects the raw
    file to report encrypted or (likely) scanned PDFs.

    Returns extracted text, or a human-readable diagnostic string.
    """
    methods_tried = []

    # Method 1: pdfplumber (best for tables/forms)
    if HAS_PDFPLUMBER:
        try:
            methods_tried.append("pdfplumber")
            with pdfplumber.open(file_path) as pdf:
                parts = []
                for i, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(f"\n--- Page {i+1} ---\n{page_text}")
                text = "".join(parts)
            if len(text.strip()) > 50:
                print(f"pdfplumber success: {len(text)} chars")
                return text
        except Exception as e:
            print(f"pdfplumber failed: {e}")

    # Method 2: pypdf (original)
    try:
        methods_tried.append("pypdf")
        reader = PdfReader(file_path)
        parts = []
        for i, page in enumerate(reader.pages):
            page_text = page.extract_text()
            if page_text and page_text.strip():
                parts.append(f"\n--- Page {i+1} ---\n{page_text}")
        text = "".join(parts)
        if len(text.strip()) > 50:
            print(f"pypdf success: {len(text)} chars")
            return text
        print(f"pypdf extracted only {len(text)} chars")
    except Exception as e:
        print(f"pypdf failed: {e}")

    # Method 3: PyPDF2 fallback
    if HAS_PYPDF2:
        try:
            methods_tried.append("PyPDF2")
            reader = PyPDF2Reader(file_path)
            parts = []
            for i, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    parts.append(f"\n--- Page {i+1} ---\n{page_text}")
            text = "".join(parts)
            if len(text.strip()) > 50:
                print(f"PyPDF2 success: {len(text)} chars")
                return text
        except Exception as e:
            print(f"PyPDF2 failed: {e}")

    # Method 4: raw header check (detect encrypted/scanned)
    try:
        with open(file_path, 'rb') as f:
            content = f.read(1024)
        if b'/Encrypt' in content:
            return "PDF is encrypted/protected. Please remove password."
        # BUG FIX: the original compared len(content) (capped at 1024 by the
        # read above) against 10000 — always true, so every non-encrypted PDF
        # reaching this point was reported as scanned. Check real file size.
        if os.path.getsize(file_path) < 10000:
            return "PDF appears to be scanned images (no text layer). Try OCR tools."
    except OSError:
        pass

    return f"No text extracted. Tried: {', '.join(methods_tried)}. Likely scanned PDF or protected."
107
 
108
+ # ==============================
109
+ # Other extractors (keep simple)
110
+ # ==============================
111
def extract_text_from_docx(file_path):
    """Return the non-empty paragraphs of a .docx file, blank-line separated."""
    try:
        document = Document(file_path)
        cleaned = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                cleaned.append(stripped)
        if not cleaned:
            return "No text in DOCX"
        return "\n\n".join(cleaned)
    except Exception as e:
        return f"DOCX error: {str(e)}"
118
 
119
def extract_text_from_excel(file_path):
    """Return a CSV preview of the first sheet (at most 100 rows)."""
    try:
        # Only the first sheet, capped at 100 rows, to keep output small.
        frame = pd.read_excel(file_path, sheet_name=0, nrows=100)
        preview = frame.fillna('').to_csv(index=False)
        return "Sheet preview:\n" + preview
    except Exception as exc:
        return "Excel error: " + str(exc)
125
 
126
def extract_text_from_txt(file_path):
    """Read a text file as UTF-8, falling back to latin-1 on decode errors.

    Note: with errors="ignore" the UTF-8 path rarely raises for encoding
    reasons; the fallback is narrowed to UnicodeDecodeError so that real
    I/O errors (e.g. file not found) propagate clearly instead of being
    swallowed by a bare except and re-raised from the second open.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(file_path, "r", encoding="latin-1", errors="ignore") as f:
            return f.read()
133
 
134
def extract_text_from_url(url):
    """Fetch *url* and return its visible text (scripts/styles removed).

    Returns at most 3000 characters, or an "URL error: ..." message on
    any failure.
    """
    try:
        r = requests.get(url, timeout=10)
        # BUG FIX: without this, 404/500 error pages were parsed and
        # ingested as if they were real content.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        for s in soup(["script", "style"]):
            s.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text[:3000]  # Limit
    except Exception as e:
        return f"URL error: {str(e)}"
144
 
145
  # ==============================
146
+ # Chunking
147
  # ==============================
148
  splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
149
 
150
  # ==============================
151
+ # SIMPLIFIED INGESTION (focus on PDF fix)
152
  # ==============================
153
def ingest_sources(files, urls):
    """Extract text from uploaded files and URLs, chunk it, and build a
    FAISS index plus a JSON metadata sidecar.

    files: Gradio upload values (file objects, path strings, or dicts).
    urls: newline-separated URL string (may be None/empty).
    Returns a human-readable status string including debug details.
    """
    docs = []
    metadata = []
    debug_info = []

    # Rebuild from scratch on every run (debug/testing behaviour).
    for path in [INDEX_PATH, METADATA_PATH]:
        if os.path.exists(path):
            os.remove(path)
            debug_info.append(f"Cleared {path}")

    processed = 0
    for f in files or []:
        processed += 1
        name = getattr(f, "name", f"file_{processed}")
        debug_info.append(f"\nπŸ” Processing: {name}")

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
        try:
            # Gradio may hand us a file object, a plain string, or a dict.
            data = None
            if hasattr(f, 'read'):
                data = f.read()
                if isinstance(data, str):
                    data = data.encode('utf-8')
            elif isinstance(f, str):
                data = f.encode('utf-8')
            elif isinstance(f, dict) and 'data' in f:
                data = f['data']
                if isinstance(data, str):
                    data = data.encode('utf-8')

            if not data:
                debug_info.append("❌ Could not read file data")
                continue

            tmp.write(data)
            tmp.flush()
            # BUG FIX: close the handle before the extractors re-open the
            # path (required on Windows, and avoids a handle leak).
            tmp.close()

            # Dispatch on extension; unknown types fall back to plain text.
            ext = os.path.splitext(name.lower())[1]
            if ext == '.pdf':
                text = extract_text_from_pdf_robust(tmp.name)
            elif ext == '.docx':
                text = extract_text_from_docx(tmp.name)
            elif ext in ['.xls', '.xlsx']:
                text = extract_text_from_excel(tmp.name)
            else:
                text = extract_text_from_txt(tmp.name)

            debug_info.append(f"πŸ“„ Extracted {len(text)} characters")

            # NOTE(review): substring heuristics — a legitimate document
            # containing the words "error" or "no text" is skipped too.
            if len(text) > 20 and "error" not in text.lower() and "no text" not in text.lower():
                chunks = splitter.split_text(text)
                valid_chunks = [c for c in chunks if len(c.strip()) > 20]
                for i, chunk in enumerate(valid_chunks):
                    docs.append(chunk)
                    metadata.append({
                        "source": name,
                        "chunk": i,
                        "type": "file",
                        "text": chunk
                    })
                debug_info.append(f"βœ… Created {len(valid_chunks)} chunks")
            else:
                debug_info.append(f"⚠️ Skipped: {'too short' if len(text) <= 20 else 'contains error'}")

        except Exception as e:
            debug_info.append(f"πŸ’₯ Error: {str(e)}")
        finally:
            try:
                tmp.close()
                os.unlink(tmp.name)
            except OSError:
                pass

    # URLs: one per line; blanks ignored.
    for raw_line in (urls or "").splitlines():
        url = raw_line.strip()
        if not url:
            continue
        text = extract_text_from_url(url)
        if len(text) > 100 and "error" not in text.lower():
            chunks = splitter.split_text(text)
            for i, c in enumerate(chunks):
                if len(c.strip()) > 20:
                    docs.append(c)
                    # BUG FIX: store the stripped URL (the one actually
                    # fetched), not the raw line with whitespace.
                    metadata.append({"source": url, "chunk": i, "type": "url", "text": c})

    debug_info.append(f"\nπŸ“Š Total: {len(docs)} chunks created")

    if not docs:
        return "❌ No valid content.\n\n" + "\n".join(debug_info)

    # Embed all chunks and persist index + metadata.
    try:
        embeddings = embed_model.encode(docs, show_progress_bar=False)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        faiss.write_index(index, INDEX_PATH)
        with open(METADATA_PATH, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False)
        return f"βœ… SUCCESS: {len(docs)} chunks ingested!\n\n" + "\n".join(debug_info[-5:])
    except Exception as e:
        return f"❌ Index failed: {str(e)}\n\n" + "\n".join(debug_info)
252
 
253
  # ==============================
254
+ # Keep retrieval and generation simple
255
  # ==============================
256
def retrieve_topk(query, k=3):
    """Return metadata dicts for the k chunks most similar to *query*.

    Returns [] when no index has been built yet.
    """
    if not os.path.exists(INDEX_PATH):
        return []
    q_emb = embed_model.encode([query])
    index = faiss.read_index(INDEX_PATH)
    D, I = index.search(q_emb, k)
    # Match the writer's encoding (ingest writes the file as UTF-8).
    with open(METADATA_PATH, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    # BUG FIX: faiss pads results with -1 when the index holds fewer than
    # k vectors; a bare `idx < len(metadata)` let -1 wrap around to the
    # LAST metadata entry. Require a non-negative index.
    return [metadata[idx] for idx in I[0] if 0 <= idx < len(metadata)]
264
 
 
 
 
265
gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)

def ask_prompt(prompt):
    """Answer *prompt* from retrieved context chunks, appending sources.

    Returns the generated answer plus a source list, or an explanatory
    message when nothing has been ingested or generation fails.
    """
    hits = retrieve_topk(prompt)
    if not hits:
        return "No documents ingested."

    # Cap each chunk at 800 chars to keep the prompt within model limits.
    context = "\n\n".join(h.get("text", "")[:800] for h in hits)
    sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]

    full_prompt = f"Context:\n{context}\n\nQ: {prompt}\nA:"
    # BUG FIX: the previous revision guarded generation with try/except;
    # that guard was dropped in this rewrite, letting pipeline errors
    # crash the UI handler. Restore it.
    try:
        result = gen_pipeline(full_prompt, max_length=300)[0]["generated_text"]
    except Exception as e:
        return f"Generation error: {str(e)}"
    return f"{result}\n\nSources:\n" + "\n".join(sources)
 
 
 
 
277
 
278
  # ==============================
279
+ # UI
280
  # ==============================
281
# Gradio UI: left column ingests files/URLs, right column asks questions.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ” Research Assistant - Debug Mode")
    with gr.Row():
        with gr.Column():
            # Ingestion inputs and debug output
            file_in = gr.File(file_count="multiple")
            urls_in = gr.Textbox(label="URLs", placeholder="https://...")
            ingest_btn = gr.Button("Ingest", variant="primary")
            ingest_out = gr.Textbox(label="Debug Output", lines=10)
        with gr.Column():
            # Question/answer panel
            prompt_in = gr.Textbox(label="Question", lines=3)
            ask_btn = gr.Button("Ask")
            answer_out = gr.Textbox(lines=10)

    # Wire buttons to the ingestion and QA functions.
    ingest_btn.click(ingest_sources, [file_in, urls_in], ingest_out)
    ask_btn.click(ask_prompt, prompt_in, answer_out)

if __name__ == "__main__":
    demo.launch()