Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from llama_index.core import VectorStoreIndex, Document, Settings, SimpleDirectoryReader, StorageContext | |
| from llama_index.core.node_parser import SemanticSplitterNodeParser | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.llms.ollama import Ollama | |
| from llama_index.retrievers.bm25 import BM25Retriever | |
| from llama_index.core.retrievers import QueryFusionRetriever | |
| from llama_index.core.retrievers.fusion_retriever import FUSION_MODES | |
| from llama_index.core.chat_engine import CondensePlusContextChatEngine | |
| from llama_index.core.memory import ChatMemoryBuffer | |
| import httpx | |
| import os | |
| import datetime | |
| from huggingface_hub import HfApi, CommitScheduler | |
| from pathlib import Path | |
| import json | |
| import uuid | |
# --- 1. CONFIGURATION ---
# Researcher identity (used in the page title and the system prompt) and the
# remote Ollama endpoint / model tag the LLM client is pointed at.
RESEARCHER_NAME = "Enoch Hyunwook Kang"
OLLAMA_BASE_URL = "https://researchbot.share.zrok.io"
OLLAMA_MODEL = "qwen3:8b"

# --- 2. LOGGING SETUP (Hugging Face Dataset) ---
# The dataset repo must be created on the Hub beforehand (ideally private),
# e.g. "ehwkang/researchbot-logs".
LOG_DATASET = "ehwkang/researchbot-logs"
LOG_FILE = "qna_logs.jsonl"

# Background uploader: pushes the local "logs" folder into the "data"
# directory of the dataset repo every 10 minutes (or on shutdown).
# NOTE(review): Streamlit re-executes this script on each interaction, so a
# new scheduler is presumably constructed per rerun -- confirm whether it
# should be cached.
scheduler = CommitScheduler(
    folder_path="logs",
    path_in_repo="data",
    repo_id=LOG_DATASET,
    repo_type="dataset",
    every=10,
)
def log_interaction(question, answer) -> None:
    """Append one question/answer exchange to the local JSONL log.

    The record is appended to ``logs/<LOG_FILE>``, the folder handed to the
    module-level ``scheduler``, so it is included in a later upload.

    Args:
        question: The user's prompt text.
        answer: The chat engine's response; stored as ``str(answer)``.
    """
    log_path = Path("logs") / LOG_FILE
    log_path.parent.mkdir(parents=True, exist_ok=True)

    entry = {
        # Timezone-aware UTC: a naive local timestamp is ambiguous once logs
        # from hosts in different zones (or across DST changes) are merged.
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "session_id": st.session_state.get("session_id"),
        "question": question,
        "answer": str(answer),
    }

    # Write under the scheduler's lock so an upload running in the
    # background does not pick up a partially written line.
    with scheduler.lock:
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with log_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(entry) + "\n")
# --- 3. CUSTOM OLLAMA CLIENT ---
class CustomOllama(Ollama):
    """Ollama LLM whose HTTP client sends the zrok interstitial-skip header.

    NOTE(review): this override only has an effect if the installed
    ``Ollama`` base class obtains its HTTP client through ``_get_client`` --
    confirm against the installed llama-index-llms-ollama version.
    """

    def _get_client(self):
        """Return a new ``httpx.Client`` bound to ``self.base_url``."""
        # Header that asks the zrok share to skip its interstitial page.
        zrok_headers = {"skip_zrok_interstitial": "true"}
        client = httpx.Client(
            base_url=self.base_url,
            headers=zrok_headers,
            timeout=120.0,
        )
        return client
# --- 4. SETUP ---
st.set_page_config(page_title=f"{RESEARCHER_NAME}'s Research", layout="centered")

# One random ID per browser session; attached to every logged interaction.
if "session_id" not in st.session_state:
    st.session_state.session_id = str(uuid.uuid4())

# Initialize Models
try:
    # Embedding Model (Runs on HF CPU - lightweight)
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    Settings.embed_model = embed_model

    # LLM (Runs on your Local GPU)
    Settings.llm = CustomOllama(
        model=OLLAMA_MODEL,
        base_url=OLLAMA_BASE_URL,
        request_timeout=120.0,
        context_window=8192,  # 8k is usually enough for RAG
        temperature=0.3,  # Lower temp for factual research answers
    )
except Exception as e:
    st.error(f"Configuration Error: {e}")
    # Halt this script run here: carrying on would fail further down with a
    # NameError on `embed_model` (or use an unconfigured Settings.llm),
    # burying the real cause shown above.
    st.stop()
# --- 5. INTELLIGENT INDEXING (Semantic + Hybrid) ---
def load_resources():
    """Load the CV text and build the vector index over the paper corpus.

    Reads ``CV.txt`` and the ``data`` directory located next to this script.

    Returns:
        A 3-tuple ``(cv_text, vector_index, nodes)``. ``cv_text`` is ``""``
        when ``CV.txt`` is absent. ``vector_index`` and ``nodes`` are both
        ``None`` when the ``data`` directory is missing or yields no
        documents. The tuple has three elements on every path so callers
        can unpack it unconditionally.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # A. Load CV
    cv_text = ""
    cv_path = os.path.join(script_dir, "CV.txt")
    if os.path.exists(cv_path):
        with open(cv_path, "r", encoding="utf-8") as f:
            cv_text = f.read()

    # B. Load Papers & Build Index
    data_dir = os.path.join(script_dir, "data")
    if not os.path.isdir(data_dir):
        # Previously a 2-tuple, which broke the caller's 3-way unpacking.
        return cv_text, None, None

    documents = SimpleDirectoryReader(
        data_dir, required_exts=[".txt"], recursive=True
    ).load_data()
    if not documents:
        return cv_text, None, None

    # SOTA 1: Semantic Chunking (Splits by meaning, not just line count)
    # Note: This runs on CPU (HF Spaces), so it might take 30-60s on boot.
    splitter = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=95,
        embed_model=embed_model,
    )
    nodes = splitter.get_nodes_from_documents(documents)

    # Create Vector Index
    vector_index = VectorStoreIndex(nodes)
    return cv_text, vector_index, nodes
# Module-level resources read by get_chat_engine(). load_resources() must
# yield exactly three values on every path for this unpacking to succeed.
(cv_content, vector_index, all_nodes) = load_resources()
# --- 6. HYBRID RETRIEVER & CHAT ENGINE ---
def get_chat_engine():
    """Assemble the hybrid-retrieval chat engine.

    Uses the module-level ``vector_index``, ``all_nodes`` and ``cv_content``.

    Returns:
        A ``CondensePlusContextChatEngine`` built on a fused vector + BM25
        retriever, or ``None`` when no vector index is available.
    """
    if not vector_index:
        return None

    top_k = 5  # shared by both retrievers and the fused result list

    # SOTA 2: Hybrid Retrieval (Vector + BM25)
    # Dense retriever: semantic similarity over the embedded nodes.
    dense_retriever = vector_index.as_retriever(similarity_top_k=top_k)
    # Sparse retriever: keyword matching, useful for exact algorithm names.
    sparse_retriever = BM25Retriever.from_defaults(
        nodes=all_nodes,
        similarity_top_k=top_k,
    )
    # Fuse both rankings. The enum member is used rather than the raw
    # string "reciprocal_rerank" (note the extra 're'), which is easy to
    # misspell.
    hybrid_retriever = QueryFusionRetriever(
        [dense_retriever, sparse_retriever],
        similarity_top_k=top_k,
        num_queries=1,
        mode=FUSION_MODES.RECIPROCAL_RANK,
        use_async=False,
    )

    # SOTA 3: CondensePlusContext
    # Rewrites follow-ups using chat history, e.g.
    # "What is its accuracy?" -> "What is the accuracy of [Previous Topic]?"
    history = ChatMemoryBuffer.from_defaults(token_limit=4000)
    persona_prompt = (
        f"You are {RESEARCHER_NAME}. Answer questions about your research based ONLY on the provided context. "
        f"If the answer is not in the context, say you don't know. "
        f"Here is your CV for biographical context:\n{cv_content}"
    )
    engine = CondensePlusContextChatEngine.from_defaults(
        retriever=hybrid_retriever,
        llm=Settings.llm,
        memory=history,
        system_prompt=persona_prompt,
        verbose=True,
    )
    return engine
# Streamlit re-executes this script on every interaction. Building a new
# engine per run would also create a new ChatMemoryBuffer each time, so the
# engine is kept in session state to preserve the conversation history that
# follow-up question condensing relies on. A missing engine (None) is
# retried on the next run.
if st.session_state.get("chat_engine") is None:
    st.session_state.chat_engine = get_chat_engine()
chat_engine = st.session_state.chat_engine

# --- 7. CHAT UI ---
if "messages" not in st.session_state:
    st.session_state.messages = [{"role": "assistant", "content": "Hello! Ask me about my research."}]

# Replay the transcript so earlier turns stay visible after each rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])

if prompt := st.chat_input("Ask a question..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)

    if chat_engine:
        with st.chat_message("assistant"):
            try:
                with st.spinner("Thinking..."):
                    response = chat_engine.chat(prompt)
            except Exception as e:
                # Top-level UI boundary: the LLM is reached over a remote
                # tunnel, so report failures instead of crashing the page.
                st.error(f"Could not get an answer: {e}")
            else:
                answer_text = str(response)
                st.write(answer_text)
                st.session_state.messages.append({"role": "assistant", "content": answer_text})
                # Log to HF Dataset
                log_interaction(prompt, response)
    else:
        st.error("Index not loaded. Check 'data' folder.")