Spaces:

Deevyankar
/

BrainChat

Sleeping

App Files Files Community

Deevyankar committed 27 days ago

Commit

8ba2439

1 Parent(s): 3c9300b

Create app.py

Browse files

Files changed (1) hide show

app.py +187 -0

app.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import os
+import subprocess
+from typing import List, Dict, Any
+import gradio as gr
+import chromadb
+from llama_index.core import VectorStoreIndex, StorageContext
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.llms.openai import OpenAI
+# Name of the Chroma collection that load_index() opens (or creates).
+COLLECTION_NAME = "neuro_course"
+# Module-level cache for the loaded index; filled in lazily by get_index().
+INDEX = None
+def get_persist_dir():
+    return "/data/chroma" if os.path.exists("/data") else "storage/chroma"
+def processed_text_exists():
+    chapter_dir = "processed/chapters"
+    return os.path.exists(chapter_dir) and any(
+        f.endswith(".txt") for f in os.listdir(chapter_dir)
+    )
+def vector_db_exists():
+    persist_dir = get_persist_dir()
+    return os.path.exists(persist_dir) and len(os.listdir(persist_dir)) > 0
+def run_extract_if_needed():
+    if not processed_text_exists():
+        print("No processed chapter text found. Running extraction...")
+        subprocess.check_call(["python", "extract_all_pdfs_chapterwise.py"])
+    else:
+        print("Processed chapter text already exists. Skipping extraction.")
+def run_ingest_if_needed():
+    if not vector_db_exists():
+        print("No vector DB found. Running ingestion...")
+        subprocess.check_call(["python", "ingest.py"])
+    else:
+        print("Vector DB already exists. Skipping ingestion.")
+def ensure_everything_ready():
+    run_extract_if_needed()
+    run_ingest_if_needed()
+def load_index():
+    persist_dir = get_persist_dir()
+    client = chromadb.PersistentClient(path=persist_dir)
+    collection = client.get_or_create_collection(COLLECTION_NAME)
+    vector_store = ChromaVectorStore(chroma_collection=collection)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    embed_model = HuggingFaceEmbedding(
+        model_name="intfloat/multilingual-e5-base"
+    )
+    return VectorStoreIndex.from_vector_store(
+        vector_store=vector_store,
+        storage_context=storage_context,
+        embed_model=embed_model
+    )
+def get_index():
+    global INDEX
+    if INDEX is None:
+        ensure_everything_ready()
+        INDEX = load_index()
+    return INDEX
+def format_sources(response, max_sources=3):
+    output = ""
+    if hasattr(response, "source_nodes") and response.source_nodes:
+        output += "\n\n---\n### Sources\n"
+        for i, sn in enumerate(response.source_nodes[:max_sources], start=1):
+            meta = sn.node.metadata or {}
+            file_name = meta.get("file_name", "unknown_file")
+            snippet = sn.node.get_text()[:250].replace("\n", " ")
+            output += f"\n**{i}. {file_name}**\n> {snippet}...\n"
+    return output
+def respond(
+    message: str,
+    history: List[Dict[str, Any]],
+    model_name: str,
+    temperature: float,
+    top_k: int,
+    show_sources: bool,
+):
+    if history is None:
+        history = []
+    if not message or not message.strip():
+        return history, ""
+    if not os.getenv("OPENAI_API_KEY"):
+        history = history + [{
+            "role": "assistant",
+            "content": "OPENAI_API_KEY missing. Add it in Hugging Face Space secrets."
+        }]
+        return history, ""
+    history = history + [{"role": "user", "content": message.strip()}]
+    try:
+        index = get_index()
+        llm = OpenAI(model=model_name, temperature=float(temperature))
+        query_engine = index.as_query_engine(
+            llm=llm,
+            similarity_top_k=int(top_k),
+            response_mode="compact"
+        )
+        prompt = (
+            "You are an interactive neurology tutor. "
+            "Answer only from the retrieved course material. "
+            "If the answer is not found, say: 'Not found in the course material.' "
+            "Keep answers concise unless the user asks for detail.\n\n"
+            f"Question: {message.strip()}"
+        )
+        response = query_engine.query(prompt)
+        answer = str(response)
+        if show_sources:
+            answer += format_sources(response, max_sources=min(int(top_k), 3))
+    except Exception as e:
+        answer = f"Error: {str(e)}"
+    history = history + [{"role": "assistant", "content": answer}]
+    return history, ""
+def clear_chat():
+    return []
+with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 Neurology Tutor")
+    gr.Markdown("Automatic pipeline: PDF extraction → chapter text → vector DB → chatbot")
+    chatbot = gr.Chatbot(height=500, type="messages")
+    msg = gr.Textbox(placeholder="Ask a question...", lines=1)
+    with gr.Row():
+        model_name = gr.Dropdown(
+            ["gpt-4o-mini", "gpt-4.1-mini"],
+            value="gpt-4o-mini",
+            label="Model"
+        )
+        temperature = gr.Slider(0.0, 0.8, value=0.2, step=0.1, label="Temperature")
+    with gr.Row():
+        top_k = gr.Slider(1, 5, value=3, step=1, label="Top-K Chunks")
+        show_sources = gr.Checkbox(value=False, label="Show Sources")
+    clear_btn = gr.Button("Clear Chat")
+    msg.submit(
+        respond,
+        inputs=[msg, chatbot, model_name, temperature, top_k, show_sources],
+        outputs=[chatbot, msg]
+    )
+    clear_btn.click(
+        clear_chat,
+        inputs=[],
+        outputs=[chatbot]
+    )
+# Launch the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()