Morinash committed on
Commit
6bb5c19
·
verified ·
1 Parent(s): 5811f5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -86
app.py CHANGED
@@ -15,32 +15,28 @@ import numpy as np
15
  from transformers import pipeline
16
 
17
  # CONFIG
18
- HF_GENERATION_MODEL = os.environ.get("HF_GENERATION_MODEL", "google/flan-t5-large") # change to DeepSeek if ready
19
- EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
20
  INDEX_PATH = "faiss_index.index"
21
  METADATA_PATH = "metadata.json"
22
 
23
- # load embedding model
24
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
25
 
26
- # helper extractors
27
- def extract_text_from_pdf(file):
28
- reader = PdfReader(file)
29
- pages = []
30
- for p in reader.pages:
31
- text = p.extract_text() or ""
32
- pages.append(text)
33
- return "\n\n".join(pages)
34
-
35
- def extract_text_from_docx(file):
36
- doc = docx.Document(file)
37
  return "\n\n".join(p.text for p in doc.paragraphs)
38
 
39
- def extract_text_from_excel(file):
40
- df_dict = pd.read_excel(file, sheet_name=None)
41
  out = []
42
- for sheet, df in df_dict.items():
43
- out.append(f"Sheet: {sheet}")
44
  out.append(df.fillna("").to_csv(index=False))
45
  return "\n\n".join(out)
46
 
@@ -49,37 +45,30 @@ def extract_text_from_url(url):
49
  soup = BeautifulSoup(r.text, "lxml")
50
  for s in soup(["script", "style", "aside", "nav", "footer"]):
51
  s.decompose()
52
- text = soup.get_text(separator="\n")
53
- return text
54
 
55
- # chunker
56
- splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
57
 
 
58
  def ingest_sources(files, urls):
59
- docs = []
60
- metadata = []
 
 
 
61
 
62
  for f in files:
63
- # make sure we have a temp file
64
  tmp = tempfile.NamedTemporaryFile(delete=False)
65
-
66
- # handle different types of file objects
67
- if hasattr(f, "read"): # normal file
68
  tmp.write(f.read())
69
- name = getattr(f, "name", "uploaded_file")
70
- elif isinstance(f, str): # NamedString or text
71
- tmp.write(f.encode("utf-8"))
72
- name = "uploaded_text.txt"
73
- elif isinstance(f, dict) and "data" in f: # HF file dict
74
- tmp.write(f["data"])
75
- name = f.get("name", "uploaded_file")
76
  else:
77
- raise ValueError(f"Unknown file type: {type(f)}")
78
-
79
  tmp.flush()
80
  tmp.close()
81
 
82
- # extract text depending on file type
 
83
  if name.lower().endswith(".pdf"):
84
  text = extract_text_from_pdf(tmp.name)
85
  elif name.lower().endswith(".docx"):
@@ -89,99 +78,83 @@ def ingest_sources(files, urls):
89
  else:
90
  with open(tmp.name, "r", encoding="utf-8", errors="ignore") as fh:
91
  text = fh.read()
92
-
93
  os.unlink(tmp.name)
94
 
95
- chunks = splitter.split_text(text)
96
- for i, c in enumerate(chunks):
97
  docs.append(c)
98
  metadata.append({"source": name, "chunk": i, "type": "file"})
99
 
100
- # handle URLs
101
  for u in urls:
102
- u = u.strip()
103
- if not u:
104
- continue
105
  try:
106
  text = extract_text_from_url(u)
107
- chunks = splitter.split_text(text)
108
- for i, c in enumerate(chunks):
109
  docs.append(c)
110
  metadata.append({"source": u, "chunk": i, "type": "url"})
111
  except Exception as e:
112
- print("url error", u, e)
113
 
114
  if not docs:
115
- return "No valid documents or URLs found."
116
 
117
  embeddings = embed_model.encode(docs, show_progress_bar=True, convert_to_numpy=True)
118
- dim = embeddings.shape[1]
119
-
120
- if os.path.exists(INDEX_PATH):
121
- index = faiss.read_index(INDEX_PATH)
122
- old_meta = json.load(open(METADATA_PATH, "r"))
123
- index.add(embeddings)
124
- old_meta.extend(metadata)
125
- json.dump(old_meta, open(METADATA_PATH, "w"))
126
- else:
127
- index = faiss.IndexFlatL2(dim)
128
- index.add(embeddings)
129
- json.dump(metadata, open(METADATA_PATH, "w"))
130
 
131
  faiss.write_index(index, INDEX_PATH)
132
- return f"Ingested {len(docs)} chunks from {len(files)} files and {len(urls)} urls."
 
 
133
 
 
134
  def retrieve_topk(query, k=5):
135
- q_emb = embed_model.encode([query], convert_to_numpy=True)
136
  if not os.path.exists(INDEX_PATH):
137
  return []
 
138
  index = faiss.read_index(INDEX_PATH)
139
  D, I = index.search(q_emb, k)
140
- metadata = json.load(open(METADATA_PATH, "r"))
141
  results = []
142
  for idx in I[0]:
143
  if idx < len(metadata):
144
- results.append((metadata[idx], idx))
145
  return results
146
 
147
- gen_pipeline = pipeline("text2text-generation", model=HF_GENERATION_MODEL, device=0 if os.environ.get("HF_DEVICE", "cpu") != "cpu" else -1)
 
148
 
149
  def ask_prompt(prompt, top_k=5):
150
  hits = retrieve_topk(prompt, k=top_k)
151
  if not hits:
152
- return "No documents ingested. Use Ingest first."
153
 
154
- context_parts = []
155
- sources = []
156
- for meta, idx in hits:
157
- sources.append(f"{meta['source']} (chunk {meta['chunk']})")
158
- context_parts.append(f"[{meta['source']} - chunk {meta['chunk']}]")
159
 
160
- context = "\n\n".join(context_parts)
161
  system_instruction = (
162
- "You are an AI research assistant. Use the contextual chunks below to answer the user's question. "
163
- "Provide a concise answer, then list sources in order of relevance."
164
  )
 
165
 
166
- prompt_text = f"{system_instruction}\n\nCONTEXT:\n{context}\n\nQUESTION:\n{prompt}\n\nAnswer:"
167
- out = gen_pipeline(prompt_text, max_length=512, do_sample=False)[0]["generated_text"]
168
- out = out + "\n\nSources:\n" + "\n".join(sources)
169
- return out
170
 
171
- # Gradio UI
172
  with gr.Blocks() as demo:
173
- gr.Markdown("# Research Assistant (prototype)\nUpload files and/or provide URLs, click Ingest, then Ask a question.")
 
174
  with gr.Row():
175
  with gr.Column():
176
- file_in = gr.File(label="Upload files (pdf/docx/xlsx/txt)", file_count="multiple")
177
- urls_in = gr.Textbox(label="URLs (one per line)", placeholder="https://example.com/article")
178
  ingest_btn = gr.Button("Ingest")
179
  ingest_output = gr.Textbox(label="Ingest status")
 
180
  with gr.Column():
181
- prompt_in = gr.Textbox(label="Your question", lines=4)
182
  ask_btn = gr.Button("Ask")
183
- answer_out = gr.Textbox(label="Answer", lines=12)
184
- ingest_btn.click(lambda files, urls: ingest_sources(files or [], (urls or "").splitlines()), inputs=[file_in, urls_in], outputs=ingest_output)
 
185
  ask_btn.click(lambda p: ask_prompt(p, top_k=5), inputs=prompt_in, outputs=answer_out)
186
 
187
  if __name__ == "__main__":
 
15
  from transformers import pipeline
16
 
17
  # CONFIG
18
+ HF_GENERATION_MODEL = os.environ.get("HF_GENERATION_MODEL", "google/flan-t5-large") # or another HF model
19
+ EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2" # smaller + faster
20
  INDEX_PATH = "faiss_index.index"
21
  METADATA_PATH = "metadata.json"
22
 
23
+ # Load embedding model (small + CPU efficient)
24
  embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
25
 
26
+ # --- Helpers ---
27
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, pages separated by blank lines."""
    page_texts = []
    for page in PdfReader(file_path).pages:
        # extract_text() can return None (e.g. image-only pages); substitute ""
        page_texts.append(page.extract_text() or "")
    return "\n\n".join(page_texts)
30
+
31
def extract_text_from_docx(file_path):
    """Return all paragraph text from the .docx document at *file_path*."""
    document = docx.Document(file_path)
    parts = [para.text for para in document.paragraphs]
    return "\n\n".join(parts)
34
 
35
def extract_text_from_excel(file_path):
    """Serialize every sheet of the workbook at *file_path* as labelled CSV text."""
    parts = []
    # sheet_name=None loads all sheets as an ordered {name: DataFrame} mapping
    for sheet_name, frame in pd.read_excel(file_path, sheet_name=None).items():
        parts.append(f"Sheet: {sheet_name}")
        parts.append(frame.fillna("").to_csv(index=False))
    return "\n\n".join(parts)
42
 
 
45
  soup = BeautifulSoup(r.text, "lxml")
46
  for s in soup(["script", "style", "aside", "nav", "footer"]):
47
  s.decompose()
48
+ return soup.get_text(separator="\n")
 
49
 
50
+ # --- Chunker (larger chunks = fewer embeddings) ---
51
+ splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
52
 
53
# --- Ingest sources ---
def ingest_sources(files, urls):
    """Extract text from uploaded files and URLs, chunk it, embed it, and write a FAISS index.

    files: iterable of upload objects (anything with .read()) or raw strings.
    urls:  iterable of URL strings; blank entries are skipped.
    Returns a human-readable status string.
    """
    docs, metadata = [], []

    # Skip if already indexed
    if os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH):
        return "Already have an index. Delete existing files to re-ingest."

    for f in files:
        # Spool the upload to a real file so the path-based extractors can open it.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        if hasattr(f, "read"):
            tmp.write(f.read())
        else:
            # plain string payload (e.g. a gradio NamedString)
            tmp.write(f.encode("utf-8"))
        tmp.flush()
        tmp.close()

        # basename keeps the source label readable when .name is a full temp path
        # (presumably gradio supplies one — confirm against the gradio version used)
        name = os.path.basename(getattr(f, "name", "uploaded_file"))

        if name.lower().endswith(".pdf"):
            text = extract_text_from_pdf(tmp.name)
        elif name.lower().endswith(".docx"):
            text = extract_text_from_docx(tmp.name)
        elif name.lower().endswith((".xlsx", ".xls")):
            text = extract_text_from_excel(tmp.name)
        else:
            with open(tmp.name, "r", encoding="utf-8", errors="ignore") as fh:
                text = fh.read()
        os.unlink(tmp.name)

        for i, c in enumerate(splitter.split_text(text)):
            docs.append(c)
            # Store the chunk text itself so retrieval can hand real context to the LLM.
            metadata.append({"source": name, "chunk": i, "type": "file", "text": c})

    for u in urls:
        u = u.strip()
        if not u:
            continue  # blank lines from the URL textbox
        try:
            text = extract_text_from_url(u)
            for i, c in enumerate(splitter.split_text(text)):
                docs.append(c)
                metadata.append({"source": u, "chunk": i, "type": "url", "text": c})
        except Exception as e:
            # best effort: one bad URL must not abort the whole ingest
            print(f"URL error for {u}: {e}")

    if not docs:
        return "No valid content found."

    embeddings = embed_model.encode(docs, show_progress_bar=True, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, INDEX_PATH)
    # `with` guarantees the metadata file is flushed and closed
    # (the previous json.dump(metadata, open(...)) leaked the handle)
    with open(METADATA_PATH, "w") as fh:
        json.dump(metadata, fh)

    return f"Ingested {len(docs)} text chunks."
107
 
108
# --- Retrieval ---
def retrieve_topk(query, k=5):
    """Return the metadata dicts of the k chunks nearest to *query* (empty list if no index)."""
    # Guard both files: a stray index without metadata would crash json.load below.
    if not (os.path.exists(INDEX_PATH) and os.path.exists(METADATA_PATH)):
        return []
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    index = faiss.read_index(INDEX_PATH)
    D, I = index.search(q_emb, k)
    with open(METADATA_PATH) as fh:
        metadata = json.load(fh)
    results = []
    for idx in I[0]:
        # FAISS pads missing neighbours with -1; the old `idx < len(metadata)`
        # check let -1 through and silently returned metadata[-1].
        if 0 <= idx < len(metadata):
            results.append(metadata[idx])
    return results
121
 
122
# --- QA pipeline ---
# Load the HF text2text model once at import time; reused for every question.
gen_pipeline = pipeline(
    "text2text-generation",
    model=HF_GENERATION_MODEL,
)
124
 
125
def ask_prompt(prompt, top_k=5):
    """Answer *prompt* with the generation model, grounded on the top_k retrieved chunks.

    Returns the model's answer followed by a "Sources:" list of the chunks used.
    """
    hits = retrieve_topk(prompt, k=top_k)
    if not hits:
        return "No documents ingested yet."

    sources = [f"{h['source']} (chunk {h['chunk']})" for h in hits]
    # BUG FIX: the context previously contained only the source labels, so the
    # model never saw any document text. Use the stored chunk text when present;
    # fall back to the bare label for indexes built before "text" was stored.
    context = "\n\n".join(
        f"[{label}]\n{h.get('text', '')}".rstrip() for label, h in zip(sources, hits)
    )

    system_instruction = (
        "You are a research assistant. Use the context below to answer the question clearly and briefly.\n"
    )
    full_prompt = f"{system_instruction}\nCONTEXT:\n{context}\n\nQUESTION:\n{prompt}\n\nAnswer:"

    out = gen_pipeline(full_prompt, max_length=400, do_sample=False)[0]["generated_text"]
    return out + "\n\nSources:\n" + "\n".join(sources)
 
 
140
 
141
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Research Assistant (light version)\nUpload PDFs, Docs, Excel, or URLs. Click **Ingest**, then ask your question.")

    with gr.Row():
        with gr.Column():
            file_in = gr.File(label="Upload files", file_count="multiple")
            urls_in = gr.Textbox(label="URLs (one per line)", placeholder="https://example.com")
            ingest_btn = gr.Button("Ingest")
            ingest_output = gr.Textbox(label="Ingest status")

        with gr.Column():
            prompt_in = gr.Textbox(label="Your question", lines=3)
            ask_btn = gr.Button("Ask")
            answer_out = gr.Textbox(label="Answer", lines=10)

    def _on_ingest(uploaded, raw_urls):
        # Normalize the Nones that empty widgets produce before delegating.
        return ingest_sources(uploaded or [], (raw_urls or "").splitlines())

    ingest_btn.click(_on_ingest, inputs=[file_in, urls_in], outputs=ingest_output)
    ask_btn.click(lambda question: ask_prompt(question, top_k=5), inputs=prompt_in, outputs=answer_out)
159
 
160
  if __name__ == "__main__":