Update app.py
app.py
CHANGED
@@ -1,4 +1,3 @@
-# app.py
 import os
 import tempfile
 import gradio as gr
@@ -7,7 +6,7 @@ import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 from pypdf import PdfReader
-import docx
+from docx import Document  # Use Document for docx extraction
 from sentence_transformers import SentenceTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import faiss
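An aside on the import swap above: the docx module is distributed as python-docx, and both `import docx` and `from docx import Document` resolve to the same package, so this change is stylistic rather than functional. A minimal sanity check, assuming python-docx is installed (not part of app.py):

# Hedged sketch: verify python-docx is importable and usable.
from docx import Document

doc = Document()                      # new empty .docx in memory
doc.add_paragraph("hello from python-docx")
doc.save("hello.docx")                # writes a file that extract_text_from_docx below could read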
@@ -17,8 +16,8 @@ from transformers import pipeline
 # ==============================
 # CONFIG
 # ==============================
-HF_GENERATION_MODEL = os.environ.get("HF_GENERATION_MODEL", "google/flan-t5-large")
-EMBEDDING_MODEL_NAME = "sentence-transformers/
+HF_GENERATION_MODEL = os.environ.get("HF_GENERATION_MODEL", "google/flan-t5-large")  # Swap to "deepseek-ai/DeepSeek-V3" if using HF Inference or GPU
+EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"  # Lighter for CPU
 INDEX_PATH = "faiss_index.index"
 METADATA_PATH = "metadata.json"
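Because HF_GENERATION_MODEL is read from the environment, the generator can be swapped without editing code, for example (flan-t5-base is just an illustration; any text2text model id works):

HF_GENERATION_MODEL=google/flan-t5-base python app.py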
@@ -31,34 +30,46 @@ embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
 # Helper text extractors
 # ==============================
 def extract_text_from_pdf(file_path):
-
-
-
+    try:
+        reader = PdfReader(file_path)
+        pages = [p.extract_text() or "" for p in reader.pages]
+        return "\n\n".join(pages)
+    except Exception as e:
+        return f"PDF extraction error: {str(e)}"
 
 def extract_text_from_docx(file_path):
-
-
+    try:
+        doc = Document(file_path)
+        return "\n\n".join(p.text for p in doc.paragraphs)
+    except Exception as e:
+        return f"DOCX extraction error: {str(e)}"
 
 def extract_text_from_excel(file_path):
-
-
-
-
-
-
+    try:
+        df_dict = pd.read_excel(file_path, sheet_name=None)
+        out = []
+        for sheet, df in df_dict.items():
+            out.append(f"Sheet: {sheet}")
+            out.append(df.fillna("").to_csv(index=False))
+        return "\n\n".join(out)
+    except Exception as e:
+        return f"Excel extraction error: {str(e)}"
 
 def extract_text_from_url(url):
-
-
-
-
-
-
+    try:
+        r = requests.get(url, timeout=10)
+        soup = BeautifulSoup(r.text, "lxml")
+        for s in soup(["script", "style", "aside", "nav", "footer"]):
+            s.decompose()
+        text = soup.get_text(separator="\n")
+        return text.strip()
+    except Exception as e:
+        return f"URL extraction error: {str(e)}"
 
 # ==============================
-# Text chunking setup
+# Text chunking setup (optimized for speed: larger chunks = fewer embeddings)
 # ==============================
-splitter = RecursiveCharacterTextSplitter(chunk_size=
+splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
 
 # ==============================
 # Ingestion function
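An aside on the new chunk size: with chunk_size=2000 and chunk_overlap=100, each chunk advances about 1,900 characters, so a document yields roughly len(text)/1900 chunks. A quick standalone sketch with hypothetical filler text (not part of app.py):

# Hedged sketch: observe the configured splitter's output size.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
text = "word " * 2000                              # ~10,000 characters of filler
chunks = splitter.split_text(text)
print(len(chunks), max(len(c) for c in chunks))    # roughly 6 chunks, each <= 2000 chars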
@@ -67,29 +78,50 @@ def ingest_sources(files, urls):
     docs = []
     metadata = []
 
+    # Skip if index exists (to persist across restarts)
+    if os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
+        return "Index already exists. Clear files to re-ingest."
+
     # Handle uploaded files
-    for f in files:
-        name = f.name
+    for f in files or []:
         tmp = tempfile.NamedTemporaryFile(delete=False)
+        name = getattr(f, "name", "uploaded_file")
         try:
+            # Robust handling for Gradio file types
+            data = None
             if hasattr(f, "read"):
-
-
-
+                data = f.read()
+            elif isinstance(f, str):
+                data = f.encode("utf-8")
+            elif isinstance(f, dict) and "data" in f:
+                data = f["data"]
+            if isinstance(data, str):
+                data = data.encode("utf-8")
+            if data is None:
+                raise ValueError(f"Unknown file type: {type(f)}")
+
+            tmp.write(data)
             tmp.flush()
             tmp.close()
 
-
+            low_name = name.lower()
+            if low_name.endswith(".pdf"):
                 text = extract_text_from_pdf(tmp.name)
-            elif
+            elif low_name.endswith(".docx"):
                 text = extract_text_from_docx(tmp.name)
-            elif
+            elif low_name.endswith((".xls", ".xlsx")):
                 text = extract_text_from_excel(tmp.name)
             else:
                 with open(tmp.name, "r", encoding="utf-8", errors="ignore") as fh:
                     text = fh.read()
+        except Exception as e:
+            return f"File processing error for {name}: {str(e)}"
         finally:
-            os.
+            if os.path.exists(tmp.name):
+                os.unlink(tmp.name)
+
+        if "error" in text.lower():
+            continue  # Skip failed extractions
 
         chunks = splitter.split_text(text)
         for i, c in enumerate(chunks):
@@ -97,37 +129,43 @@ def ingest_sources(files, urls):
             metadata.append({"source": name, "chunk": i, "type": "file", "text": c})
 
     # Handle URLs
-    for u in urls:
-
+    for u in (urls or "").splitlines():
+        u = u.strip()
+        if not u:
+            continue
+        text = extract_text_from_url(u)
+        if "error" in text.lower():
             continue
-
-
-
-
-
-            metadata.append({"source": u, "chunk": i, "type": "url", "text": c})
-        except Exception as e:
-            print("URL error:", u, e)
+
+        chunks = splitter.split_text(text)
+        for i, c in enumerate(chunks):
+            docs.append(c)
+            metadata.append({"source": u, "chunk": i, "type": "url", "text": c})
 
     if not docs:
-        return "No text extracted
+        return "No valid text extracted. Check files/URLs."
 
-
-
+    try:
+        embeddings = embed_model.encode(docs, show_progress_bar=True, convert_to_numpy=True)
+    except Exception as e:
+        return f"Embedding failed: {str(e)}"
 
+    dim = embeddings.shape[1]
     if os.path.exists(INDEX_PATH):
         index = faiss.read_index(INDEX_PATH)
         old_meta = json.load(open(METADATA_PATH, "r", encoding="utf-8"))
         index.add(embeddings)
         old_meta.extend(metadata)
-
+        with open(METADATA_PATH, "w", encoding="utf-8") as fh:
+            json.dump(old_meta, fh)
     else:
         index = faiss.IndexFlatL2(dim)
         index.add(embeddings)
-
+        with open(METADATA_PATH, "w", encoding="utf-8") as fh:
+            json.dump(metadata, fh)
 
     faiss.write_index(index, INDEX_PATH)
-    return f"Ingested {len(docs)}
+    return f"Ingested {len(docs)} chunks from {len(files or [])} files and {len((urls or '').splitlines())} URLs."
 
 # ==============================
 # Retrieve top matching chunks
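The ingestion hunk above boils down to an encode, IndexFlatL2.add, write_index round-trip. A self-contained sketch of that core, with illustrative strings and file names (not part of app.py):

# Hedged sketch: exact L2 search over sentence embeddings with FAISS.
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2")
docs = ["FAISS does exact L2 search.", "Gradio builds quick demo UIs."]
emb = model.encode(docs, convert_to_numpy=True)     # shape (2, dim), float32

index = faiss.IndexFlatL2(emb.shape[1])             # dim taken from the embedding width
index.add(emb)
faiss.write_index(index, "demo.index")

q = model.encode(["exact nearest-neighbour search"], convert_to_numpy=True)
D, I = faiss.read_index("demo.index").search(q, 1)  # distances, row ids into docs
print(docs[I[0][0]])                                # prints the FAISS sentence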
@@ -138,84 +176,64 @@ def retrieve_topk(query, k=5):
     q_emb = embed_model.encode([query], convert_to_numpy=True)
     index = faiss.read_index(INDEX_PATH)
     D, I = index.search(q_emb, k)
-
-
-    for idx in I[0]
-        if idx < len(metadata):
-            results.append(metadata[idx])
+    with open(METADATA_PATH, "r", encoding="utf-8") as fh:
+        metadata = json.load(fh)
+    results = [metadata[idx] for idx in I[0] if idx < len(metadata)]
     return results
 
 # ==============================
-# Generation pipeline
+# Generation pipeline (force CPU for free tier)
 # ==============================
 gen_pipeline = pipeline(
     "text2text-generation",
     model=HF_GENERATION_MODEL,
-    device
+    device=-1,  # CPU only
 )
 
 # ==============================
 # Ask prompt
 # ==============================
 def ask_prompt(prompt, top_k=5):
-    if not os.path.exists(INDEX_PATH) or not os.path.exists(METADATA_PATH):
-        return "No documents ingested yet."
-
     hits = retrieve_topk(prompt, k=top_k)
     if not hits:
-        return "No relevant
+        return "No relevant content found. Ingest files/URLs first."
 
-
-    context_parts = [h["text"] for h in hits if "text" in h]
+    context_parts = [h.get("text", "") for h in hits]
     sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
 
-    context = "\n\n".join(context_parts)
-    if not context
-        return "No
+    context = "\n\n".join(filter(None, context_parts))  # Skip empty texts
+    if not context:
+        return "No usable text in retrieved chunks."
 
     system_instruction = (
-        "You are a helpful research assistant.
-        "and answer the question accurately and concisely."
+        "You are a helpful research assistant. Use only the provided context to answer the question accurately and concisely."
     )
-
-    full_prompt = f"{system_instruction}\n\nCONTEXT:\n{context}\n\nQUESTION:\n{prompt}\n\nAnswer:"
+    full_prompt = f"{system_instruction}\n\nContext:\n{context}\n\nQuestion:\n{prompt}\n\nAnswer:"
 
     try:
-        out = gen_pipeline(full_prompt, max_length=
+        out = gen_pipeline(full_prompt, max_length=512, do_sample=False)[0]["generated_text"]
     except Exception as e:
-        return f"
+        return f"Generation failed: {str(e)}"
 
-    return out
+    return f"{out}\n\nSources:\n" + "\n".join(sources)
 
 # ==============================
 # Gradio UI
 # ==============================
 with gr.Blocks() as demo:
-    gr.Markdown(
-        "# 🧠 Research Assistant (Prototype)\nUpload files or paste URLs, click **Ingest**, then ask your question."
-    )
+    gr.Markdown("# Research Assistant Prototype\nUpload PDFs, Word, Excel, or paste URLs. Ingest, then ask questions.")
     with gr.Row():
         with gr.Column():
-            file_in = gr.File(
-
-            )
-            urls_in = gr.Textbox(
-                label="URLs (one per line)",
-                placeholder="https://example.com/article",
-            )
+            file_in = gr.File(label="Upload files (PDF/DOCX/XLSX/TXT)", file_count="multiple")
+            urls_in = gr.Textbox(label="URLs (one per line)", placeholder="https://example.com")
             ingest_btn = gr.Button("Ingest")
-            ingest_output = gr.Textbox(label="
+            ingest_output = gr.Textbox(label="Status")
         with gr.Column():
-            prompt_in = gr.Textbox(label="
+            prompt_in = gr.Textbox(label="Question", lines=4)
             ask_btn = gr.Button("Ask")
             answer_out = gr.Textbox(label="Answer", lines=12)
-
-    ingest_btn.click(
-        lambda files, urls: ingest_sources(files or [], (urls or "").splitlines()),
-        inputs=[file_in, urls_in],
-        outputs=ingest_output,
-    )
-    ask_btn.click(lambda p: ask_prompt(p, top_k=5), inputs=prompt_in, outputs=answer_out)
+    ingest_btn.click(ingest_sources, inputs=[file_in, urls_in], outputs=ingest_output)
+    ask_btn.click(ask_prompt, inputs=prompt_in, outputs=answer_out)
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
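For reference, each entry appended to metadata.json during ingestion has this shape (field values here are illustrative):

{"source": "report.pdf", "chunk": 0, "type": "file", "text": "...first ~2000 characters of the document..."}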
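The import list implies roughly this dependency set (a plausible requirements.txt sketch, not taken from the Space; openpyxl is assumed for pd.read_excel, lxml for the BeautifulSoup parser, torch as the transformers backend):

gradio
pandas
openpyxl
requests
beautifulsoup4
lxml
pypdf
python-docx
sentence-transformers
langchain
faiss-cpu
transformers
torch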
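To try it locally, install the dependencies and run `python app.py`; Gradio serves the UI at http://127.0.0.1:7860 by default. Ingest files or URLs first, then ask a question.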