Spaces:

Morinash
/

notebookLM

Sleeping

App Files Files Community

Morinash committed on Oct 14, 2025

Commit

2d0f5ab

verified ·

1 Parent(s): 26c15de

Update app.py

Browse files

Files changed (1) hide show

app.py +346 -196

app.py CHANGED Viewed

@@ -13,19 +13,11 @@ import faiss
 import numpy as np
 from transformers import pipeline
 import traceback
-# Try multiple PDF libraries
-try:
-    from PyPDF2 import PdfReader as PyPDF2Reader
-    HAS_PYPDF2 = True
-except ImportError:
-    HAS_PYPDF2 = False
-try:
-    import pdfplumber
-    HAS_PDFPLUMBER = True
-except ImportError:
-    HAS_PDFPLUMBER = False
 # ==============================
 # CONFIG
@@ -35,264 +27,422 @@ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
 INDEX_PATH = "faiss_index.index"
 METADATA_PATH = "metadata.json"
-embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
 # ==============================
-# ROBUST PDF EXTRACTION
 # ==============================
-def extract_text_from_pdf_robust(file_path):
-    """Try multiple PDF extraction methods"""
-    methods_tried = []
-    # Method 1: pdfplumber (best for tables/forms)
-    if HAS_PDFPLUMBER:
-        try:
-            methods_tried.append("pdfplumber")
-            with pdfplumber.open(file_path) as pdf:
-                text = ""
-                for i, page in enumerate(pdf.pages):
-                    page_text = page.extract_text()
-                    if page_text:
-                        text += f"\n--- Page {i+1} ---\n{page_text}"
-                if len(text.strip()) > 50:
-                    print(f"pdfplumber success: {len(text)} chars")
-                    return text
-        except Exception as e:
-            print(f"pdfplumber failed: {e}")
-    # Method 2: pypdf (original)
     try:
-        methods_tried.append("pypdf")
         reader = PdfReader(file_path)
         text = ""
-        for i, page in enumerate(reader.pages):
-            page_text = page.extract_text()
-            if page_text and page_text.strip():
-                text += f"\n--- Page {i+1} ---\n{page_text}"
-        if len(text.strip()) > 50:
-            print(f"pypdf success: {len(text)} chars")
-            return text
-        print(f"pypdf extracted only {len(text)} chars")
-    except Exception as e:
-        print(f"pypdf failed: {e}")
-    # Method 3: PyPDF2 fallback
-    if HAS_PYPDF2:
-        try:
-            methods_tried.append("PyPDF2")
-            reader = PyPDF2Reader(file_path)
-            text = ""
-            for i, page in enumerate(reader.pages):
                 page_text = page.extract_text()
                 if page_text and page_text.strip():
-                    text += f"\n--- Page {i+1} ---\n{page_text}"
-            if len(text.strip()) > 50:
-                print(f"PyPDF2 success: {len(text)} chars")
-                return text
-        except Exception as e:
-            print(f"PyPDF2 failed: {e}")
-    # Method 4: Raw bytes check (detect encrypted/scanned)
-    try:
-        with open(file_path, 'rb') as f:
-            content = f.read(1024)
-            if b'/Encrypt' in content:
-                return "PDF is encrypted/protected. Please remove password."
-            if len(content) < 10000:
-                return "PDF appears to be scanned images (no text layer). Try OCR tools."
-    except:
-        pass
-    return f"No text extracted. Tried: {', '.join(methods_tried)}. Likely scanned PDF or protected."
-# ==============================
-# Other extractors (keep simple)
-# ==============================
 def extract_text_from_docx(file_path):
     try:
         doc = Document(file_path)
         paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
-        return "\n\n".join(paragraphs) if paragraphs else "No text in DOCX"
     except Exception as e:
         return f"DOCX error: {str(e)}"
 def extract_text_from_excel(file_path):
     try:
-        df = pd.read_excel(file_path, sheet_name=0, nrows=100)  # Limit rows
-        return f"Sheet preview:\n{df.fillna('').to_csv(index=False)}"
     except Exception as e:
         return f"Excel error: {str(e)}"
 def extract_text_from_txt(file_path):
-    try:
-        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-            return f.read()
-    except:
-        with open(file_path, "r", encoding="latin-1", errors="ignore") as f:
-            return f.read()
 def extract_text_from_url(url):
     try:
-        r = requests.get(url, timeout=10)
-        soup = BeautifulSoup(r.text, "html.parser")
-        for s in soup(["script", "style"]):
-            s.decompose()
-        text = soup.get_text(separator="\n", strip=True)
-        return text[:3000]  # Limit
     except Exception as e:
         return f"URL error: {str(e)}"
 # ==============================
-# Chunking
-# ==============================
-splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
-# ==============================
-# SIMPLIFIED INGESTION (focus on PDF fix)
 # ==============================
 def ingest_sources(files, urls):
     docs = []
     metadata = []
     debug_info = []
-    # Clear old index for testing
     for path in [INDEX_PATH, METADATA_PATH]:
         if os.path.exists(path):
             os.remove(path)
-            debug_info.append(f"Cleared {path}")
-    processed = 0
     for f in files or []:
-        processed += 1
-        name = getattr(f, "name", f"file_{processed}")
-        debug_info.append(f"\n🔍 Processing: {name}")
-        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1])
         try:
-            # Handle Gradio file formats
-            data = None
-            if hasattr(f, 'read'):
-                data = f.read()
-                if isinstance(data, str): data = data.encode('utf-8')
-            elif isinstance(f, str):
-                data = f.encode('utf-8')
-            elif isinstance(f, dict) and 'data' in f:
-                data = f['data']
-                if isinstance(data, str): data = data.encode('utf-8')
-            if not data:
-                debug_info.append("❌ Could not read file data")
-                continue
-            tmp.write(data)
-            tmp.flush()
-            # Extract text
             ext = os.path.splitext(name.lower())[1]
             if ext == '.pdf':
-                text = extract_text_from_pdf_robust(tmp.name)
-            elif ext == '.docx':
-                text = extract_text_from_docx(tmp.name)
-            elif ext in ['.xls', '.xlsx']:
-                text = extract_text_from_excel(tmp.name)
             else:
-                text = extract_text_from_txt(tmp.name)
-            debug_info.append(f"📄 Extracted {len(text)} characters")
-            # Lower threshold for "valid" content
-            if len(text) > 20 and "error" not in text.lower() and "no text" not in text.lower():
                 chunks = splitter.split_text(text)
-                valid_chunks = [c for c in chunks if len(c.strip()) > 20]
                 for i, chunk in enumerate(valid_chunks):
                     docs.append(chunk)
                     metadata.append({
-                        "source": name,
-                        "chunk": i,
-                        "type": "file",
-                        "text": chunk
                     })
                 debug_info.append(f"✅ Created {len(valid_chunks)} chunks")
             else:
-                debug_info.append(f"⚠️ Skipped: {'too short' if len(text) <= 20 else 'contains error'}")
         except Exception as e:
-            debug_info.append(f"💥 Error: {str(e)}")
-        finally:
-            try: os.unlink(tmp.name)
-            except: pass
-    # URLs (simplified)
-    for url in (urls or "").splitlines():
-        if url.strip():
-            text = extract_text_from_url(url.strip())
-            if len(text) > 100 and "error" not in text.lower():
-                chunks = splitter.split_text(text)
-                for i, c in enumerate(chunks):
-                    if len(c.strip()) > 20:
-                        docs.append(c)
-                        metadata.append({"source": url, "chunk": i, "type": "url", "text": c})
-    debug_info.append(f"\n📊 Total: {len(docs)} chunks created")
     if not docs:
-        return "❌ No valid content.\n\n" + "\n".join(debug_info)
-    # Build index
     try:
-        embeddings = embed_model.encode(docs, show_progress_bar=False)
-        index = faiss.IndexFlatL2(embeddings.shape[1])
-        index.add(embeddings)
         faiss.write_index(index, INDEX_PATH)
-        with open(METADATA_PATH, "w", encoding="utf-8") as f:
-            json.dump(metadata, f, ensure_ascii=False)
-        return f"✅ SUCCESS: {len(docs)} chunks ingested!\n\n" + "\n".join(debug_info[-5:])
     except Exception as e:
-        return f"❌ Index failed: {str(e)}\n\n" + "\n".join(debug_info)
 # ==============================
-# Keep retrieval and generation simple
 # ==============================
-def retrieve_topk(query, k=3):
-    if not os.path.exists(INDEX_PATH): return []
-    q_emb = embed_model.encode([query])
-    index = faiss.read_index(INDEX_PATH)
-    D, I = index.search(q_emb, k)
-    with open(METADATA_PATH, "r") as f:
-        metadata = json.load(f)
-    return [metadata[idx] for idx in I[0] if idx < len(metadata)]
-gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
-def ask_prompt(prompt):
-    hits = retrieve_topk(prompt)
-    if not hits: return "No documents ingested."
-    context = "\n\n".join([h.get("text", "")[:800] for h in hits])
-    sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
-    full_prompt = f"Context:\n{context}\n\nQ: {prompt}\nA:"
-    result = gen_pipeline(full_prompt, max_length=300)[0]["generated_text"]
-    return f"{result}\n\nSources:\n" + "\n".join(sources)
 # ==============================
-# UI
 # ==============================
-with gr.Blocks() as demo:
-    gr.Markdown("# 🔍 Research Assistant - Debug Mode")
-    with gr.Row():
-        with gr.Column():
-            file_in = gr.File(file_count="multiple")
-            urls_in = gr.Textbox(label="URLs", placeholder="https://...")
-            ingest_btn = gr.Button("Ingest", variant="primary")
-            ingest_out = gr.Textbox(label="Debug Output", lines=10)
-        with gr.Column():
-            prompt_in = gr.Textbox(label="Question", lines=3)
-            ask_btn = gr.Button("Ask")
-            answer_out = gr.Textbox(lines=10)
-    ingest_btn.click(ingest_sources, [file_in, urls_in], ingest_out)
-    ask_btn.click(ask_prompt, prompt_in, answer_out)
 if __name__ == "__main__":
-    demo.launch()

 import numpy as np
 from transformers import pipeline
 import traceback
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # ==============================
 # CONFIG
 INDEX_PATH = "faiss_index.index"
 METADATA_PATH = "metadata.json"
+# Global variables
+embed_model = None
+splitter = None
+gen_pipeline = None
+def initialize_models():
+    global embed_model, splitter, gen_pipeline
+    try:
+        embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+        splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
+        gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
+        logger.info("Models initialized successfully")
+        return True
+    except Exception as e:
+        logger.error(f"Model initialization failed: {e}")
+        return False
+# Initialize on startup
+if not initialize_models():
+    raise RuntimeError("Failed to initialize models")
 # ==============================
+# FILE EXTRACTION FUNCTIONS
 # ==============================
+def extract_text_from_pdf(file_path):
+    """Extract text from PDF using pypdf"""
     try:
+        logger.info(f"Extracting PDF from: {file_path}")
         reader = PdfReader(file_path)
         text = ""
+        page_count = len(reader.pages)
+        # Extract from first few pages only for speed
+        for i, page in enumerate(reader.pages[:5]):
+            try:
                 page_text = page.extract_text()
                 if page_text and page_text.strip():
+                    text += f"\n--- Page {i+1}/{page_count} ---\n{page_text}\n"
+            except Exception as e:
+                logger.warning(f"Failed to extract page {i+1}: {e}")
+                continue
+        logger.info(f"PDF extraction complete: {len(text)} characters")
+        return text.strip()
+    except Exception as e:
+        logger.error(f"PDF extraction error: {e}")
+        return f"PDF extraction failed: {str(e)}"
 def extract_text_from_docx(file_path):
+    """Extract text from DOCX"""
     try:
         doc = Document(file_path)
         paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
+        text = "\n\n".join(paragraphs)
+        logger.info(f"DOCX extraction: {len(paragraphs)} paragraphs")
+        return text
     except Exception as e:
+        logger.error(f"DOCX extraction error: {e}")
         return f"DOCX error: {str(e)}"
 def extract_text_from_excel(file_path):
+    """Extract text from Excel (first sheet preview)"""
     try:
+        df = pd.read_excel(file_path, sheet_name=0, nrows=50)  # Limit rows
+        text = f"Excel Sheet Preview ({df.shape[0]} rows):\n\n{df.fillna('').to_string(index=False)}"
+        logger.info(f"Excel extraction: {df.shape}")
+        return text
     except Exception as e:
+        logger.error(f"Excel extraction error: {e}")
         return f"Excel error: {str(e)}"
 def extract_text_from_txt(file_path):
+    """Extract text from plain text files"""
+    encodings = ['utf-8', 'latin-1', 'cp1252']
+    for encoding in encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
+                return f.read()
+        except:
+            continue
+    return "Could not read text file with available encodings"
 def extract_text_from_url(url):
+    """Extract text from URL"""
     try:
+        headers = {'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'}
+        r = requests.get(url, timeout=15, headers=headers)
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, 'html.parser')
+        for script in soup(["script", "style", "nav", "footer"]):
+            script.decompose()
+        text = soup.get_text(separator='\n', strip=True)
+        # Clean up excessive whitespace
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+        return '\n'.join(lines[:100])  # Limit lines
     except Exception as e:
+        logger.error(f"URL extraction error: {e}")
         return f"URL error: {str(e)}"
 # ==============================
+# MAIN INGESTION FUNCTION
 # ==============================
 def ingest_sources(files, urls):
+    """Process files and URLs, create embeddings index"""
     docs = []
     metadata = []
     debug_info = []
+    # Clear existing index for fresh ingestion
     for path in [INDEX_PATH, METADATA_PATH]:
         if os.path.exists(path):
             os.remove(path)
+            debug_info.append(f"🗑️ Cleared existing {os.path.basename(path)}")
+    # Process files
+    processed_files = 0
     for f in files or []:
+        processed_files += 1
         try:
+            # Get filename
+            name = getattr(f, 'name', f'file_{processed_files}')
+            if not name:
+                name = f'uploaded_file_{processed_files}'
+            debug_info.append(f"\n📁 Processing: {os.path.basename(name)}")
+            # Create temp file
+            suffix = os.path.splitext(name)[1] or '.txt'
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir='/tmp') as tmp:
+                # Handle different Gradio file formats
+                file_data = None
+                if hasattr(f, 'read'):
+                    file_data = f.read()
+                    if isinstance(file_data, str):
+                        file_data = file_data.encode('utf-8')
+                elif isinstance(f, dict) and 'data' in f:
+                    file_data = f['data']
+                    if isinstance(file_data, str):
+                        file_data = file_data.encode('utf-8')
+                elif isinstance(f, str):
+                    file_data = f.encode('utf-8')
+                if not file_data:
+                    debug_info.append("❌ No file data available")
+                    continue
+                tmp.write(file_data)
+                tmp_path = tmp.name
+                tmp.flush()
+            # Extract text based on extension
             ext = os.path.splitext(name.lower())[1]
+            text = ""
             if ext == '.pdf':
+                text = extract_text_from_pdf(tmp_path)
+            elif ext in ['.doc', '.docx']:
+                text = extract_text_from_docx(tmp_path)
+            elif ext in ['.xls', '.xlsx', '.csv']:
+                text = extract_text_from_excel(tmp_path)
             else:
+                text = extract_text_from_txt(tmp_path)
+            # Show preview of extracted content
+            preview = text[:200].replace('\n', ' ').strip()
+            if len(preview) > 100:
+                preview = preview[:100] + "..."
+            debug_info.append(f"📄 Extracted {len(text)} chars")
+            debug_info.append(f"🔍 Preview: '{preview}'")
+            # Create chunks if we have substantial content
+            if len(text.strip()) > 30 and not text.startswith(('error', 'PDF extraction failed')):
                 chunks = splitter.split_text(text)
+                valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
                 for i, chunk in enumerate(valid_chunks):
                     docs.append(chunk)
                     metadata.append({
+                        "source": os.path.basename(name),
+                        "chunk_id": i,
+                        "type": "file",
+                        "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
                     })
                 debug_info.append(f"✅ Created {len(valid_chunks)} chunks")
             else:
+                debug_info.append("⚠️ Skipped: insufficient content or extraction error")
+            # Cleanup
+            try:
+                os.unlink(tmp_path)
+            except:
+                pass
         except Exception as e:
+            debug_info.append(f"💥 Error processing file: {str(e)}")
+            logger.error(f"File processing error: {e}", exc_info=True)
+    # Process URLs
+    if urls and urls.strip():
+        debug_info.append(f"\n🌐 Processing URLs:")
+        for url_line in urls.strip().split('\n'):
+            url = url_line.strip()
+            if url.startswith('http'):
+                debug_info.append(f"  📡 {url}")
+                text = extract_text_from_url(url)
+                if len(text.strip()) > 100 and not text.startswith('URL error'):
+                    chunks = splitter.split_text(text)
+                    valid_chunks = [c.strip() for c in chunks if len(c.strip()) > 20]
+                    for i, chunk in enumerate(valid_chunks):
+                        docs.append(chunk)
+                        metadata.append({
+                            "source": url,
+                            "chunk_id": i,
+                            "type": "url",
+                            "content_preview": chunk[:100] + "..." if len(chunk) > 100 else chunk
+                        })
+                    debug_info.append(f"  ✅ Created {len(valid_chunks)} chunks from URL")
+                else:
+                    debug_info.append(f"  ⚠️ URL skipped: insufficient content")
+    debug_info.append(f"\n📊 SUMMARY: {len(docs)} total chunks created")
     if not docs:
+        return "❌ No valid content extracted.\n\n" + "\n".join(debug_info[-15:])
+    # Create FAISS index
     try:
+        debug_info.append("🔄 Creating embeddings and index...")
+        embeddings = embed_model.encode(docs, show_progress_bar=False, convert_to_numpy=True)
+        dimension = embeddings.shape[1]
+        index = faiss.IndexFlatL2(dimension)
+        index.add(embeddings.astype('float32'))
+        # Save index and metadata
         faiss.write_index(index, INDEX_PATH)
+        with open(METADATA_PATH, 'w', encoding='utf-8') as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=2)
+        debug_info.append(f"✅ Index created successfully: {embeddings.shape[0]} vectors")
+        return f"🎉 SUCCESS! Ingested {len(docs)} chunks from {processed_files} files.\n\n" + "\n".join(debug_info[-8:])
+    except Exception as e:
+        debug_info.append(f"💥 Index creation failed: {str(e)}")
+        logger.error(f"Index creation error: {e}", exc_info=True)
+        return f"❌ Indexing failed: {str(e)}\n\n" + "\n".join(debug_info[-10:])
+# ==============================
+# RETRIEVAL
+# ==============================
+def retrieve_topk(query, k=5):
+    """Retrieve top k relevant chunks"""
+    try:
+        if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
+            return []
+        query_embedding = embed_model.encode([query], convert_to_numpy=True)
+        index = faiss.read_index(INDEX_PATH)
+        distances, indices = index.search(query_embedding.astype('float32'), k)
+        with open(METADATA_PATH, 'r', encoding='utf-8') as f:
+            metadata = json.load(f)
+        results = []
+        for i, idx in enumerate(indices[0]):
+            if idx < len(metadata):
+                results.append({
+                    **metadata[idx],
+                    "distance": float(distances[0][i])
+                })
+        return results[:k]
     except Exception as e:
+        logger.error(f"Retrieval error: {e}")
+        return []
 # ==============================
+# GENERATION
 # ==============================
+def ask_prompt(query):
+    """Generate answer based on retrieved context"""
+    try:
+        hits = retrieve_topk(query, k=3)
+        if not hits:
+            return "No relevant documents found. Please ingest some files first."
+        # Build context from top hits
+        context_parts = []
+        sources = []
+        for hit in hits:
+            content = hit.get('content_preview', '') or ''
+            if len(content) > 50:
+                context_parts.append(content)
+                source_info = f"{hit['source']} (chunk {hit['chunk_id']})"
+                if hit.get('distance'):
+                    source_info += f" [relevance: {hit['distance']:.3f}]"
+                sources.append(source_info)
+        if not context_parts:
+            return "Retrieved documents but no content available."
+        context = "\n\n".join(context_parts)
+        full_prompt = f"""Based on the following context, answer the question.
+Context:
+{context}
+Question: {query}
+Answer:"""
+        # Generate response
+        result = gen_pipeline(
+            full_prompt,
+            max_length=400,
+            min_length=50,
+            do_sample=False,
+            temperature=0.1
+        )[0]['generated_text']
+        # Extract just the answer part
+        if "Answer:" in result:
+            answer = result.split("Answer:", 1)[1].strip()
+        else:
+            answer = result
+        response = f"{answer}\n\n**Sources:**\n" + "\n".join(sources)
+        return response
+    except Exception as e:
+        logger.error(f"Generation error: {e}")
+        return f"Error generating response: {str(e)}"
 # ==============================
+# GRADIO UI
 # ==============================
+def create_ui():
+    with gr.Blocks(title="Research Assistant", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("""
+        # 🔍 Research Assistant
+        Upload documents and ask questions about their content.
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 📤 Document Ingestion")
+                file_input = gr.File(
+                    label="Upload Files",
+                    file_count="multiple",
+                    file_types=[".pdf", ".docx", ".doc", ".txt", ".xlsx", ".xls", ".csv"]
+                )
+                url_input = gr.Textbox(
+                    label="Or paste URLs (one per line)",
+                    placeholder="https://example.com/document\nhttps://another-site.com/page",
+                    lines=3
+                )
+                ingest_button = gr.Button("🚀 Ingest Documents", variant="primary", size="lg")
+                status_output = gr.Textbox(
+                    label="Ingestion Status",
+                    lines=12,
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                gr.Markdown("### ❓ Ask Questions")
+                query_input = gr.Textbox(
+                    label="Your Question",
+                    placeholder="What does the document say about...",
+                    lines=3
+                )
+                ask_button = gr.Button("💬 Get Answer", variant="secondary")
+                answer_output = gr.Textbox(
+                    label="Answer",
+                    lines=12,
+                    interactive=False
+                )
+        # Event handlers
+        ingest_button.click(
+            ingest_sources,
+            inputs=[file_input, url_input],
+            outputs=status_output
+        )
+        ask_button.click(
+            ask_prompt,
+            inputs=query_input,
+            outputs=answer_output
+        )
+        # Examples
+        gr.Examples(
+            examples=[
+                ["What is the main topic of the documents?"],
+                ["Summarize the key points."],
+                ["What are the dates mentioned?"],
+            ],
+            inputs=query_input
+        )
+    return demo
+# ==============================
+# MAIN
+# ==============================
 if __name__ == "__main__":
+    demo = create_ui()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,  # Set to True for public sharing
+        debug=True
+    )