Morinash committed on
Commit
baeb9d2
·
verified ·
1 Parent(s): a708a4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -44
app.py CHANGED
@@ -14,16 +14,20 @@ import faiss
14
  import numpy as np
15
  from transformers import pipeline
16
 
 
17
  # CONFIG
18
- HF_GENERATION_MODEL = os.environ.get("HF_GENERATION_MODEL", "google/flan-t5-large") # or another HF model
19
- EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2" # smaller + faster
 
20
  INDEX_PATH = "faiss_index.index"
21
  METADATA_PATH = "metadata.json"
22
 
23
- # Load embedding model (small + CPU efficient)
24
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
25
 
26
- # --- Helpers ---
 
 
27
def extract_text_from_pdf(file_path):
    """Extract the text of every page in a PDF, pages separated by blank lines."""
    pages = []
    for page in PdfReader(file_path).pages:
        # extract_text() can return None for image-only pages; keep "" instead.
        pages.append(page.extract_text() or "")
    return "\n\n".join(pages)
@@ -47,65 +51,103 @@ def extract_text_from_url(url):
47
  s.decompose()
48
  return soup.get_text(separator="\n")
49
 
50
- # --- Chunker (larger chunks = fewer embeddings) ---
 
 
51
  splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
52
 
53
- # --- Ingest sources ---
 
 
54
  def ingest_sources(files, urls):
55
  docs, metadata = [], []
56
 
57
- # Skip if already indexed
58
  if os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
59
- return "Already have an index. Delete existing files to re-ingest."
60
 
61
  for f in files:
62
  tmp = tempfile.NamedTemporaryFile(delete=False)
63
- if hasattr(f, "read"):
64
- tmp.write(f.read())
65
- else:
66
- tmp.write(f.encode("utf-8"))
67
- tmp.flush()
68
- tmp.close()
69
-
70
- name = getattr(f, "name", "uploaded_file")
71
-
72
- if name.lower().endswith(".pdf"):
73
- text = extract_text_from_pdf(tmp.name)
74
- elif name.lower().endswith(".docx"):
75
- text = extract_text_from_docx(tmp.name)
76
- elif name.lower().endswith((".xls", ".xlsx")):
77
- text = extract_text_from_excel(tmp.name)
78
- else:
79
- with open(tmp.name, "r", encoding="utf-8", errors="ignore") as fh:
80
- text = fh.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  os.unlink(tmp.name)
82
 
83
  for i, c in enumerate(splitter.split_text(text)):
84
  docs.append(c)
85
  metadata.append({"source": name, "chunk": i, "type": "file"})
86
 
87
- for u in urls:
 
 
 
88
  try:
89
  text = extract_text_from_url(u)
90
  for i, c in enumerate(splitter.split_text(text)):
91
  docs.append(c)
92
  metadata.append({"source": u, "chunk": i, "type": "url"})
93
  except Exception as e:
94
- print(f"URL error for {u}: {e}")
95
 
96
  if not docs:
97
- return "No valid content found."
98
-
99
- embeddings = embed_model.encode(docs, show_progress_bar=True, convert_to_numpy=True)
100
- index = faiss.IndexFlatL2(embeddings.shape[1])
101
- index.add(embeddings)
102
-
103
- faiss.write_index(index, INDEX_PATH)
104
- json.dump(metadata, open(METADATA_PATH, "w"))
105
-
106
- return f"Ingested {len(docs)} text chunks."
107
-
108
- # --- Retrieval ---
 
 
 
 
 
 
 
 
 
 
109
  def retrieve_topk(query, k=5):
110
  if not os.path.exists(INDEX_PATH):
111
  return []
@@ -119,8 +161,10 @@ def retrieve_topk(query, k=5):
119
  results.append(metadata[idx])
120
  return results
121
 
122
- # --- QA pipeline ---
123
- gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL)
 
 
124
 
125
  def ask_prompt(prompt, top_k=5):
126
  hits = retrieve_topk(prompt, k=top_k)
@@ -138,9 +182,11 @@ def ask_prompt(prompt, top_k=5):
138
  out = gen_pipeline(full_prompt, max_length=400, do_sample=False)[0]["generated_text"]
139
  return out + "\n\nSources:\n" + "\n".join(sources)
140
 
141
- # --- Gradio UI ---
 
 
142
  with gr.Blocks() as demo:
143
- gr.Markdown("# 🧠 Research Assistant (light version)\nUpload PDFs, Docs, Excel, or URLs. Click **Ingest**, then ask your question.")
144
 
145
  with gr.Row():
146
  with gr.Column():
 
14
  import numpy as np
15
  from transformers import pipeline
16
 
17
+ # -----------------------------
18
  # CONFIG
19
+ # -----------------------------
20
+ HF_GENERATION_MODEL = os.environ.get("HF_GENERATION_MODEL", "google/flan-t5-large") # You can switch later to DeepSeek
21
+ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2" # Faster, smaller
22
  INDEX_PATH = "faiss_index.index"
23
  METADATA_PATH = "metadata.json"
24
 
25
+ # Load embedding model
26
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
27
 
28
+ # -----------------------------
29
+ # FILE HELPERS
30
+ # -----------------------------
31
def extract_text_from_pdf(file_path):
    """Return all page text from the PDF at *file_path*, joined by blank lines."""
    reader = PdfReader(file_path)
    # Fall back to "" when a page yields no extractable text (e.g. scanned pages).
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n\n".join(page_texts)
 
51
  s.decompose()
52
  return soup.get_text(separator="\n")
53
 
54
+ # -----------------------------
55
+ # CHUNKER (larger = faster)
56
+ # -----------------------------
57
  splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
58
 
59
+ # -----------------------------
60
+ # INGESTION
61
+ # -----------------------------
62
def ingest_sources(files, urls):
    """Ingest uploaded files and URLs into the persisted FAISS index.

    Each source is extracted to text, split into chunks, embedded, and the
    resulting index plus per-chunk metadata are written to INDEX_PATH /
    METADATA_PATH. Always returns a human-readable status string so the UI
    never sees a raised exception.
    """
    # Guard against None from the UI before the final summary does len(...).
    files = files or []
    urls = [u.strip() for u in (urls or []) if u and u.strip()]
    docs, metadata = [], []

    # Re-ingestion requires deleting the persisted artifacts first.
    if os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
        return "Index already exists. Delete the files to re-ingest."

    for f in files:
        tmp = tempfile.NamedTemporaryFile(delete=False)
        unknown_type_msg = None
        try:
            if hasattr(f, "read"):
                data = f.read()
                if isinstance(data, str):
                    data = data.encode("utf-8")
                tmp.write(data)
                name = getattr(f, "name", "uploaded_file")
            elif isinstance(f, dict) and "data" in f:
                data = f["data"]
                if isinstance(data, str):
                    data = data.encode("utf-8")
                tmp.write(data)
                name = f.get("name", "uploaded_file")
            elif isinstance(f, str):
                # NOTE(review): a bare str is treated as raw text content, not
                # as a filesystem path — confirm against the Gradio caller.
                tmp.write(f.encode("utf-8"))
                name = "uploaded_text.txt"
            else:
                # BUG FIX: the original closed+unlinked tmp here and then hit
                # tmp.flush() in finally on a closed file, raising ValueError
                # instead of returning this message. Defer cleanup instead.
                unknown_type_msg = f"Unknown upload type: {type(f)}"
        finally:
            # tmp is still open on every path, so flush/close is always safe.
            tmp.flush()
            tmp.close()

        if unknown_type_msg is not None:
            os.unlink(tmp.name)
            return unknown_type_msg

        try:
            low = name.lower()
            if low.endswith(".pdf"):
                text = extract_text_from_pdf(tmp.name)
            elif low.endswith(".docx"):
                text = extract_text_from_docx(tmp.name)
            elif low.endswith((".xls", ".xlsx")):
                text = extract_text_from_excel(tmp.name)
            else:
                # Anything else is read as best-effort UTF-8 text.
                with open(tmp.name, "r", encoding="utf-8", errors="ignore") as fh:
                    text = fh.read()
        except Exception as e:
            # Best-effort ingestion: log and move on to the next file.
            print(f"Extraction error for {name}: {e}")
            continue
        finally:
            # Single cleanup point for the temp file on both paths.
            os.unlink(tmp.name)

        for i, c in enumerate(splitter.split_text(text)):
            docs.append(c)
            metadata.append({"source": name, "chunk": i, "type": "file"})

    for u in urls:
        try:
            text = extract_text_from_url(u)
            for i, c in enumerate(splitter.split_text(text)):
                docs.append(c)
                metadata.append({"source": u, "chunk": i, "type": "url"})
        except Exception as e:
            print(f"URL fetch error for {u}: {e}")

    if not docs:
        return "No content ingested (empty or failed files)."

    try:
        embeddings = embed_model.encode(docs, show_progress_bar=True, convert_to_numpy=True)
    except Exception as e:
        return f"Embedding error: {e}"

    try:
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        faiss.write_index(index, INDEX_PATH)
        with open(METADATA_PATH, "w", encoding="utf-8") as fh:
            json.dump(metadata, fh)
    except Exception as e:
        return f"Indexing error: {e}"

    return f"Ingested {len(docs)} chunks from {len(files)} files and {len(urls)} URLs."
147
+
148
+ # -----------------------------
149
+ # RETRIEVAL
150
+ # -----------------------------
151
  def retrieve_topk(query, k=5):
152
  if not os.path.exists(INDEX_PATH):
153
  return []
 
161
  results.append(metadata[idx])
162
  return results
163
 
164
+ # -----------------------------
165
+ # GENERATION PIPELINE
166
+ # -----------------------------
167
+ gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=-1)
168
 
169
  def ask_prompt(prompt, top_k=5):
170
  hits = retrieve_topk(prompt, k=top_k)
 
182
  out = gen_pipeline(full_prompt, max_length=400, do_sample=False)[0]["generated_text"]
183
  return out + "\n\nSources:\n" + "\n".join(sources)
184
 
185
+ # -----------------------------
186
+ # GRADIO UI
187
+ # -----------------------------
188
  with gr.Blocks() as demo:
189
+ gr.Markdown("# 🧠 Research Assistant (light version)\nUpload PDFs, Word, Excel, or URLs. Click **Ingest**, then ask your question.")
190
 
191
  with gr.Row():
192
  with gr.Column():