Spaces:

Morinash
/

notebookLM

Sleeping

App Files Files Community

Morinash committed on Oct 14, 2025

Commit

6b8b552

verified ·

1 Parent(s): 18ef8c7

Update app.py

Browse files

Files changed (1) hide show

app.py +243 -302

app.py CHANGED Viewed

@@ -11,11 +11,9 @@ import faiss
 import numpy as np
 from transformers import pipeline
 import logging
-import subprocess
-import shutil
-import re
-# PDF libraries with fallbacks
 try:
     from pypdf import PdfReader
     HAS_PYPDF = True
@@ -28,8 +26,6 @@ try:
 except:
     HAS_PDFPLUMBER = False
-HAS_PDFTOTEXT = shutil.which('pdftotext') is not None
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -42,362 +38,307 @@ INDEX_PATH = "faiss_index.index"
 METADATA_PATH = "metadata.json"
 # Initialize models
-try:
-    embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
-    gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
-    logger.info("Models loaded successfully")
-except Exception as e:
-    logger.error(f"Model loading failed: {e}")
-    raise
 # ==============================
-# SIMPLE TEXT SPLITTER (NO LANGCHAIN)
 # ==============================
 def simple_text_splitter(text, chunk_size=1000, chunk_overlap=100):
-    """Simple recursive text splitter without langchain"""
     if len(text) <= chunk_size:
         return [text.strip()]
     chunks = []
     start = 0
     while start < len(text):
-        end = start + chunk_size
-        # Try to split at sentence boundaries
-        if end < len(text):
-            # Look for sentence endings near chunk boundary
-            for boundary in [end-50, end-20, end]:
-                if boundary < len(text):
-                    # Find sentence breaks
-                    sentence_end = text.rfind('.', 0, boundary)
-                    sentence_end = max(sentence_end, text.rfind('!', 0, boundary))
-                    sentence_end = max(sentence_end, text.rfind('?', 0, boundary))
-                    sentence_end = max(sentence_end, text.rfind('\n\n', 0, boundary))
-                    if sentence_end > start + chunk_overlap:
-                        end = sentence_end + 1
-                        break
         chunk = text[start:end].strip()
-        if len(chunk) > 50:  # Only add substantial chunks
             chunks.append(chunk)
         start = end - chunk_overlap
-    return chunks
 # ==============================
-# ROBUST PDF EXTRACTION
 # ==============================
-def extract_text_from_pdf_simple(file_path):
-    """Try multiple methods to extract PDF text"""
-    text = ""
-    # Method 1: pdftotext (most reliable)
-    if HAS_PDFTOTEXT:
-        try:
-            result = subprocess.run(
-                ['pdftotext', '-layout', file_path, '-'],
-                capture_output=True, text=True, timeout=20
-            )
-            if result.returncode == 0 and len(result.stdout.strip()) > 20:
-                return result.stdout.strip()
-        except Exception as e:
-            logger.warning(f"pdftotext failed: {e}")
-    # Method 2: pdfplumber
-    if HAS_PDFPLUMBER:
-        try:
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages[:5]:  # First 5 pages
-                    page_text = page.extract_text()
-                    if page_text:
-                        text += page_text + "\n\n"
-            if len(text.strip()) > 50:
-                return text.strip()
-        except Exception as e:
-            logger.warning(f"pdfplumber failed: {e}")
-    # Method 3: pypdf with aggressive error handling
-    if HAS_PYPDF:
-        try:
-            reader = PdfReader(file_path)
-            for page in reader.pages[:3]:  # First 3 pages only
-                try:
-                    # Multiple extraction attempts
-                    page_text = None
-                    if hasattr(page, 'extract_text'):
-                        page_text = page.extract_text()
-                    elif hasattr(page, 'extractText'):
-                        page_text = page.extractText()
-                    if page_text and len(page_text.strip()) > 10:
-                        text += page_text + "\n\n"
-                except:
-                    continue  # Skip problematic pages
-            if len(text.strip()) > 50:
-                return text.strip()
-        except Exception as e:
-            logger.warning(f"pypdf failed: {e}")
-    return f"PDF extraction failed - likely scanned images or corrupted file (size: {os.path.getsize(file_path)} bytes)"
 # ==============================
-# OTHER FILE EXTRACTION
 # ==============================
-def extract_text_from_docx(file_path):
     try:
-        doc = Document(file_path)
-        return "\n\n".join([p.text.strip() for p in doc.paragraphs if p.text.strip()])
-    except:
-        return "DOCX extraction failed"
-def extract_text_from_excel(file_path):
     try:
-        df = pd.read_excel(file_path, nrows=50)
-        return df.to_string(index=False)
-    except:
-        return "Excel extraction failed"
-def extract_text_from_txt(file_path):
-    encodings = ['utf-8', 'latin-1', 'cp1252']
-    for enc in encodings:
-        try:
-            with open(file_path, 'r', encoding=enc, errors='ignore') as f:
-                content = f.read().strip()
-                if len(content) > 10:
-                    return content
-        except:
-            continue
-    return "Text file reading failed"
-def extract_text_from_url(url):
     try:
-        headers = {'User-Agent': 'Mozilla/5.0'}
-        r = requests.get(url, headers=headers, timeout=10)
-        soup = BeautifulSoup(r.text, 'html.parser')
-        for tag in soup(['script', 'style']):
-            tag.decompose()
-        text = soup.get_text(separator='\n', strip=True)
-        return ' '.join(text.split())[:3000]
     except:
-        return "URL extraction failed"
 # ==============================
-# MAIN INGESTION FUNCTION
 # ==============================
 def ingest_sources(files, urls=""):
     docs = []
     metadata = []
     debug_info = []
-    # Clear existing index
     for path in [INDEX_PATH, METADATA_PATH]:
         if os.path.exists(path):
-            try:
-                os.remove(path)
-                debug_info.append(f"🗑️ Cleared {os.path.basename(path)}")
-            except:
-                pass
     # Process files
-    for i, f in enumerate(files or []):
-        try:
-            name = getattr(f, 'name', f'file_{i+1}')
-            debug_info.append(f"\n📄 Processing: {os.path.basename(name) if name else 'Unknown'}")
-            # Create temp file
-            ext = '.txt'
-            if name:
-                ext = os.path.splitext(name)[1] or '.txt'
-            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
-                # Read file data
-                data = None
-                if hasattr(f, 'read'):
-                    data = f.read()
-                    if isinstance(data, str):
-                        data = data.encode('utf-8')
-                elif isinstance(f, dict) and 'data' in f:
-                    data = f['data']
-                if not data:
-                    debug_info.append("❌ No file data")
-                    continue
-                tmp.write(data)
-                tmp_path = tmp.name
-            # Extract text
-            file_ext = os.path.splitext(name.lower())[1] if name else ''
-            text = ""
-            if file_ext == '.pdf':
-                text = extract_text_from_pdf_simple(tmp_path)
-            elif file_ext in ['.docx', '.doc']:
-                text = extract_text_from_docx(tmp_path)
-            elif file_ext in ['.xlsx', '.xls', '.csv']:
-                text = extract_text_from_excel(tmp_path)
-            else:
-                text = extract_text_from_txt(tmp_path)
-            # Debug preview
-            preview = text[:150].replace('\n', ' ').strip()
-            if len(preview) > 100:
-                preview = preview[:100] + "..."
-            debug_info.append(f"📊 Extracted {len(text)} characters")
-            debug_info.append(f"🔍 Preview: '{preview}'")
-            # Create chunks if we have content
-            if len(text.strip()) > 30 and not text.startswith(('PDF extraction failed', 'extraction failed')):
-                chunks = simple_text_splitter(text)
-                valid_chunks = [c for c in chunks if len(c.strip()) > 20]
-                for j, chunk in enumerate(valid_chunks):
-                    docs.append(chunk)
-                    metadata.append({
-                        "source": os.path.basename(name) if name else f"file_{i+1}",
-                        "chunk": j,
-                        "text": chunk
-                    })
-                debug_info.append(f"�� Created {len(valid_chunks)} chunks")
-            else:
-                debug_info.append("⚠️ Skipped: insufficient content or extraction error")
-            # Cleanup
-            try:
-                os.unlink(tmp_path)
-            except:
-                pass
-        except Exception as e:
-            debug_info.append(f"💥 Error: {str(e)}")
-            logger.error(f"File processing error: {e}")
-    # Process URLs
-    for url in urls.strip().split('\n'):
-        url = url.strip()
-        if url.startswith('http'):
-            text = extract_text_from_url(url)
-            if len(text.strip()) > 100:
-                chunks = simple_text_splitter(text)
-                for j, chunk in enumerate(chunks):
-                    if len(chunk.strip()) > 20:
-                        docs.append(chunk)
-                        metadata.append({
-                            "source": url,
-                            "chunk": j,
-                            "text": chunk,
-                            "type": "url"
-                        })
-    debug_info.append(f"\n📈 TOTAL: {len(docs)} chunks created")
-    if not docs:
-        return "❌ No valid content extracted.\n\n" + "\n".join(debug_info[-12:])
-    # Create FAISS index
-    try:
-        debug_info.append("🔄 Creating embeddings...")
-        embeddings = embed_model.encode(docs, show_progress_bar=False)
-        dimension = embeddings.shape[1]
-        index = faiss.IndexFlatL2(dimension)
-        index.add(embeddings.astype('float32'))
-        faiss.write_index(index, INDEX_PATH)
-        with open(METADATA_PATH, 'w', encoding='utf-8') as f:
-            json.dump(metadata, f, ensure_ascii=False, indent=2)
-        debug_info.append("✅ Index created successfully!")
-        return f"🎉 SUCCESS! Ingested {len(docs)} chunks.\n\n" + "\n".join(debug_info[-6:])
-    except Exception as e:
-        debug_info.append(f"💥 Indexing failed: {str(e)}")
-        return f"❌ Indexing error: {str(e)}\n\n" + "\n".join(debug_info[-6:])
 # ==============================
 # RETRIEVAL & GENERATION
 # ==============================
 def retrieve_topk(query, k=3):
-    try:
-        if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
-            return []
-        q_emb = embed_model.encode([query], show_progress_bar=False)
-        index = faiss.read_index(INDEX_PATH)
-        distances, indices = index.search(q_emb.astype('float32'), k)
-        with open(METADATA_PATH, 'r', encoding='utf-8') as f:
-            metadata = json.load(f)
-        return [metadata[i] for i in indices[0] if i < len(metadata)]
-    except:
         return []
 def ask_prompt(query):
     hits = retrieve_topk(query)
     if not hits:
-        return "No documents ingested or no relevant matches found."
-    context = "\n\n".join([h.get('text', '')[:800] for h in hits])
     sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
-    full_prompt = f"""Context:
-{context}
-Question: {query}
-Answer:"""
-    try:
-        result = gen_pipeline(full_prompt, max_length=300, do_sample=False)[0]['generated_text']
-        answer = result.split('Answer:')[-1].strip() if 'Answer:' in result else result
-        return f"{answer}\n\n**Sources:**\n" + "\n".join(sources)
-    except Exception as e:
-        return f"Generation error: {str(e)}"
 # ==============================
-# GRADIO UI
 # ==============================
-with gr.Blocks(title="Document QA", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🔍 Document Q&A Assistant")
-    gr.Markdown("Upload PDFs, DOCX, Excel files or URLs to create a searchable knowledge base.")
     with gr.Row():
-        with gr.Column(scale=1):
-            file_input = gr.File(file_count="multiple", label="Upload Files")
-            url_input = gr.Textbox(label="URLs (one per line)", lines=2)
-            ingest_btn = gr.Button("🚀 Ingest Documents", variant="primary")
-            status_output = gr.Textbox(label="Ingestion Status", lines=12)
-        with gr.Column(scale=1):
-            query_input = gr.Textbox(label="Ask a question about your documents", lines=3)
-            ask_btn = gr.Button("💬 Get Answer", variant="secondary")
-            answer_output = gr.Textbox(label="Answer", lines=10)
-    # Events
-    ingest_btn.click(
-        ingest_sources,
-        inputs=[file_input, url_input],
-        outputs=status_output
-    )
-    ask_btn.click(
-        ask_prompt,
-        inputs=query_input,
-        outputs=answer_output
-    )
-    gr.Markdown("### Tips:")
-    gr.Markdown("""
-    - **PDFs**: Works best with searchable PDFs (not scanned images)
-    - **Scanned PDFs**: Convert to searchable text first using Adobe Acrobat or online OCR
-    - **Large files**: Processing may take 1-2 minutes
-    - **Test first**: Try with a simple text file to verify setup
-    """)
 if __name__ == "__main__":
     demo.launch()

 import numpy as np
 from transformers import pipeline
 import logging
+import io
+# PDF libraries
 try:
     from pypdf import PdfReader
     HAS_PYPDF = True
 except:
     HAS_PDFPLUMBER = False
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 METADATA_PATH = "metadata.json"
 # Initialize models
# Both models are created once at import time and shared by every request.
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
gen_pipeline = pipeline(
    task="text2text-generation",
    model=HF_GENERATION_MODEL,
    device=-1,  # -1 is the transformers convention for running on CPU
)
 # ==============================
+# SIMPLE TEXT SPLITTER
 # ==============================
 def simple_text_splitter(text, chunk_size=1000, chunk_overlap=100):
     if len(text) <= chunk_size:
         return [text.strip()]
     chunks = []
     start = 0
     while start < len(text):
+        end = min(start + chunk_size, len(text))
         chunk = text[start:end].strip()
+        if len(chunk) > 50:
             chunks.append(chunk)
         start = end - chunk_overlap
+    return [c for c in chunks if len(c) > 20]
 # ==============================
+# CORRECTED FILE HANDLING FOR GRADIO
 # ==============================
def get_file_data(file_obj):
    """Resolve an uploaded file object into ``(payload, kind)``.

    The upload may arrive as an object exposing ``.name`` (a path on disk),
    an object exposing ``.data``, a readable stream, a dict, or a plain
    string path. The sources are probed in that order.

    Args:
        file_obj: The value handed over by the file upload component.

    Returns:
        On success, ``(payload, kind)`` where ``kind`` is one of
        ``"path"``, ``"bytes"``, ``"read"``, ``"dict"``, ``"dict_path"`` or
        ``"string_path"``. For the path kinds ``payload`` is the path; for
        the others it is the raw content.
        On failure, ``(None, messages)`` where ``messages`` is a list of
        human-readable strings describing what was tried.
    """
    debug = []

    # Method 1: object with a .name that points at a real file on disk.
    # A name that does not exist is only a label (e.g. an in-memory stream
    # tagged with its original filename), so keep probing for real data.
    name = getattr(file_obj, 'name', None)
    if name and isinstance(name, (str, bytes)):
        if os.path.exists(name):
            return name, "path"
        debug.append(f"Name is not an existing path: {name}")

    # Method 2: object carrying its content in .data
    data_attr = getattr(file_obj, 'data', None)
    if data_attr:
        return data_attr, "bytes"

    # Method 3: readable stream; rewind first in case it was already consumed
    if hasattr(file_obj, 'read'):
        try:
            if hasattr(file_obj, 'seek'):
                file_obj.seek(0)
            data = file_obj.read()
        except Exception as e:  # arbitrary stream types: record and move on
            debug.append(f"Read failed: {e}")
        else:
            if data:
                return data, "read"

    # Method 4: dict payload, either inline content or a path reference
    if isinstance(file_obj, dict):
        if file_obj.get('data'):
            return file_obj['data'], "dict"
        for key in ('name', 'path'):
            if file_obj.get(key):
                return file_obj[key], "dict_path"

    # Method 5: plain string path
    if isinstance(file_obj, str) and os.path.exists(file_obj):
        return file_obj, "string_path"

    debug.append("❌ No valid file data found")
    return None, debug
 # ==============================
+# PDF EXTRACTION
 # ==============================
def extract_pdf_text(file_data, source_type, debug_info):
    """Extract text from a PDF, trying several backends in turn.

    The backends are tried in this order: the ``pdftotext`` command-line
    tool, ``pdfplumber`` (first 5 pages), then ``pypdf`` (first 3 pages).
    The first backend that yields enough text wins.

    Args:
        file_data: A filesystem path when ``source_type`` is ``"path"``,
            ``"string_path"`` or ``"dict_path"``; otherwise the raw PDF
            content (``bytes``, or ``str`` which is encoded as Latin-1).
        source_type: The kind tag describing ``file_data``.
        debug_info: List that receives progress and failure messages; it is
            mutated in place.

    Returns:
        The extracted text, ``"File not found"`` when a path does not
        exist, or ``"No text extracted - likely scanned PDF images"`` when
        no backend produced text.
    """
    import subprocess  # local import: only this function shells out

    path_kinds = ("path", "string_path", "dict_path")
    byte_kinds = ("bytes", "read", "dict")
    no_text = "No text extracted - likely scanned PDF images"
    temp_path = None
    try:
        if source_type in path_kinds:
            file_path = file_data
            if not os.path.exists(file_path):
                debug_info.append(f"❌ File path doesn't exist: {file_path}")
                return "File not found"
        elif source_type in byte_kinds:
            if isinstance(file_data, str):
                # PDFs are binary; Latin-1 maps code points 0-255 one-to-one
                payload = file_data.encode('latin1', errors='replace')
            else:
                payload = file_data
            # The context manager closes the handle before the backends
            # reopen the file by name.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                temp_path = tmp.name
                tmp.write(payload)
            file_path = temp_path
            debug_info.append(f"Created temp file: {temp_path}")
        else:
            # Without a recognised kind there is nothing to open.
            debug_info.append(f"❌ Unsupported source type: {source_type}")
            return no_text

        # Backend 1: pdftotext, if the binary is installed
        try:
            result = subprocess.run(['pdftotext', file_path, '-'],
                                    capture_output=True, text=True, timeout=15)
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            # Missing binary, timeout or undecodable output: fall through
            debug_info.append(f"pdftotext unavailable: {e}")
        else:
            if result.returncode == 0 and len(result.stdout.strip()) > 30:
                debug_info.append(f"✅ pdftotext: {len(result.stdout)} chars")
                return result.stdout

        # Backend 2: pdfplumber
        if HAS_PDFPLUMBER:
            try:
                with pdfplumber.open(file_path) as pdf:
                    page_texts = [page.extract_text() for page in pdf.pages[:5]]
                text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
                if len(text.strip()) > 50:
                    debug_info.append(f"✅ pdfplumber: {len(text)} chars")
                    return text
            except Exception as e:
                debug_info.append(f"pdfplumber failed: {e}")

        # Backend 3: pypdf
        if HAS_PYPDF:
            try:
                reader = PdfReader(file_path)
                parts = []
                for page in reader.pages[:3]:
                    try:
                        page_text = page.extract_text()
                    except Exception:
                        # One unreadable page should not discard the rest
                        continue
                    if page_text and page_text.strip():
                        parts.append(page_text + "\n")
                text = "".join(parts)
                if len(text.strip()) > 30:
                    debug_info.append(f"✅ pypdf: {len(text)} chars")
                    return text
            except Exception as e:
                debug_info.append(f"pypdf failed: {e}")

        return no_text
    finally:
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError:
                # Best-effort cleanup; a stale temp file is not fatal
                pass
+# ==============================
+# OTHER EXTRACTIONS
+# ==============================
def extract_docx_text(file_data, source_type, debug_info):
    """Extract paragraph text from a DOCX document.

    Args:
        file_data: A filesystem path when ``source_type`` is ``"path"``,
            ``"string_path"`` or ``"dict_path"``; otherwise the raw
            document bytes.
        source_type: The kind tag describing ``file_data``.
        debug_info: List that receives a message when extraction fails; it
            is mutated in place.

    Returns:
        The non-empty paragraphs joined by blank lines,
        ``"No text in DOCX"`` when 20 characters or fewer were found, or a
        string starting with ``"DOCX error:"`` on failure.
    """
    path_kinds = ("path", "string_path", "dict_path")
    tmp_path = None
    try:
        if source_type in path_kinds:
            doc = Document(file_data)
        else:
            if not isinstance(file_data, (bytes, bytearray)):
                raise TypeError(
                    f"unsupported data type {type(file_data).__name__}"
                )
            # Spill the bytes to disk so they can be opened by path
            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp:
                tmp_path = tmp.name
                tmp.write(file_data)
            doc = Document(tmp_path)

        text = "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
        if len(text) > 20:
            return text
        return "No text in DOCX"
    except Exception as e:
        debug_info.append(f"DOCX extraction failed: {e}")
        return f"DOCX error: {e}"
    finally:
        # Remove the temp copy even when parsing raised
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
def extract_text_file(file_data, source_type, debug_info):
    """Return the text content of a plain-text upload.

    Args:
        file_data: A filesystem path when ``source_type`` is ``"path"``,
            ``"string_path"`` or ``"dict_path"``; otherwise the raw content
            as ``bytes`` or an object convertible with ``str()``.
        source_type: The kind tag describing ``file_data``.
        debug_info: List that receives a message when reading fails; it is
            mutated in place.

    Returns:
        The decoded text (UTF-8, undecodable bytes ignored), or
        ``"Text extraction failed"`` when the content could not be read.
    """
    path_kinds = ("path", "string_path", "dict_path")
    try:
        if source_type in path_kinds:
            # Every path kind must be opened; returning str(path) would
            # index the file name instead of its content.
            with open(file_data, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        if isinstance(file_data, (bytes, bytearray)):
            return bytes(file_data).decode('utf-8', errors='ignore')
        return str(file_data)
    except (OSError, ValueError, TypeError) as e:
        debug_info.append(f"Text read failed: {e}")
        return "Text extraction failed"
 # ==============================
+# MAIN INGESTION
 # ==============================
 def ingest_sources(files, urls=""):
     docs = []
     metadata = []
     debug_info = []
+    # Clear existing
     for path in [INDEX_PATH, METADATA_PATH]:
         if os.path.exists(path):
+            os.remove(path)
     # Process files
+    for i, file_obj in enumerate(files or []):
+        debug_info.append(f"\n📄 Processing file {i+1}")
+        # Get file data correctly
+        file_data, source_info = get_file_data(file_obj)
+        if isinstance(source_info, list):
+            debug_info.extend(source_info)
+            continue
+        if not file_data:
+            debug_info.append("❌ No file data")
+            continue
+        # Get filename and extension
+        filename = getattr(file_obj, 'name', f'file_{i+1}')
+        if isinstance(filename, bytes):
+            filename = filename.decode('utf-8', errors='ignore')
+        ext = os.path.splitext(filename.lower())[1] if filename else ''
+        debug_info.append(f"File: {filename}, Type: {source_info}")
+        # Extract text
+        text = ""
+        if ext == '.pdf':
+            text = extract_pdf_text(file_data, source_info, debug_info)
+        elif ext in ['.docx', '.doc']:
+            text = extract_docx_text(file_data, source_info, debug_info)
+        elif ext in ['.txt', '.md']:
+            text = extract_text_file(file_data, source_info, debug_info)
+        else:
+            debug_info.append(f"Unknown extension: {ext}")
+            continue
+        # Preview
+        preview = text[:100].replace('\n', ' ').strip()
+        if len(preview) > 80:
+            preview = preview[:80] + "..."
+        debug_info.append(f"Extracted {len(text)} chars")
+        debug_info.append(f"Preview: '{preview}'")
+        # Create chunks
+        if len(text.strip()) > 30:
+            chunks = simple_text_splitter(text)
+            for j, chunk in enumerate(chunks):
+                docs.append(chunk)
+                metadata.append({
+                    "source": filename,
+                    "chunk": j,
+                    "text": chunk
+                })
+            debug_info.append(f"✅ {len(chunks)} chunks created")
+        else:
+            debug_info.append("⚠️ Insufficient content")
+    debug_info.append(f"\n📊 Total: {len(docs)} chunks")
+    if docs:
+        embeddings = embed_model.encode(docs)
+        index = faiss.IndexFlatL2(embeddings.shape[1])
+        index.add(embeddings)
+        faiss.write_index(index, INDEX_PATH)
+        with open(METADATA_PATH, 'w') as f:
+            json.dump(metadata, f)
+        return f"✅ SUCCESS: {len(docs)} chunks!"
+    return "❌ No content.\n\n" + "\n".join(debug_info[-15:])
 # ==============================
 # RETRIEVAL & GENERATION
 # ==============================
 def retrieve_topk(query, k=3):
+    if not os.path.exists(INDEX_PATH):
         return []
+    q_emb = embed_model.encode([query])
+    index = faiss.read_index(INDEX_PATH)
+    D, I = index.search(q_emb, k)
+    with open(METADATA_PATH, 'r') as f:
+        metadata = json.load(f)
+    return [metadata[i] for i in I[0] if i < len(metadata)]
 def ask_prompt(query):
     hits = retrieve_topk(query)
     if not hits:
+        return "No documents found."
+    context = "\n\n".join([h['text'][:600] for h in hits])
+    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
+    result = gen_pipeline(prompt, max_length=300)[0]['generated_text']
     sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
+    return f"{result}\n\nSources:\n" + "\n".join(sources)
 # ==============================
+# UI
 # ==============================
# Two-column layout: upload and ingestion status on the left,
# question and answer on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Document QA")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(file_count="multiple")
            ingest_btn = gr.Button("Ingest", variant="primary")
            status = gr.Textbox(lines=15)
        with gr.Column():
            query_input = gr.Textbox(label="Question")
            ask_btn = gr.Button("Ask")
            answer = gr.Textbox(lines=10)

    # The constant empty State fills the `urls` argument of ingest_sources,
    # which has no input widget in this layout.
    ingest_btn.click(
        fn=ingest_sources,
        inputs=[file_input, gr.State("")],
        outputs=status,
    )
    ask_btn.click(fn=ask_prompt, inputs=query_input, outputs=answer)

if __name__ == "__main__":
    demo.launch()