"""Streamlit chat UI for semantic search over a StackOverflow sample dataset."""

from __future__ import annotations

import html
import re
import subprocess
import sys
from pathlib import Path
from typing import List

import streamlit as st
from sentence_transformers import SentenceTransformer

from search_engine import SemanticSearchEngine

# ================= CONFIG =================
DATASET_PATH = Path("data/stackoverflow_sample_3000.json")

# One StackOverflow-style code block; group 1 captures the code body.
# NOTE(review): the delimiter pattern was unreadable in the reviewed source;
# it is assumed to be <pre><code>...</code></pre> -- confirm against the dataset.
_CODE_BLOCK_RE = re.compile(
    r"<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>",
    re.DOTALL | re.IGNORECASE,
)

# Any simple HTML tag left in the prose segments.
_HTML_TAG_RE = re.compile(r"<.*?>")


# ================= DATASET SETUP =================
def ensure_dataset() -> None:
    """Run the dataset preparation script when DATASET_PATH does not exist.

    The script ``prepare_stackoverflow_sample.py`` is looked up next to this
    file and executed with the current interpreter. On a non-zero exit code
    the captured output is shown with ``st.error`` and the Streamlit run is
    stopped.
    """
    if DATASET_PATH.exists():
        return

    with st.spinner("Preparing dataset (first run only)..."):
        script = Path(__file__).parent / "prepare_stackoverflow_sample.py"
        result = subprocess.run(
            [sys.executable, str(script)],
            capture_output=True,
            text=True,
            check=False,  # the return code is inspected explicitly below
        )

    if result.returncode != 0:
        # Some failures only write to stdout; never show an empty error box.
        details = result.stderr or result.stdout
        st.error(f"Dataset preparation failed:\n\n{details}")
        st.stop()


# ================= ENGINE =================
@st.cache_resource(show_spinner=False)
def load_engine() -> SemanticSearchEngine:
    """Return the search engine built from DATASET_PATH (cached by Streamlit)."""
    return SemanticSearchEngine(DATASET_PATH)


# ================= EMBEDDING =================
@st.cache_resource(show_spinner=False)
def load_embedder() -> SentenceTransformer:
    """Return the ``all-MiniLM-L6-v2`` sentence encoder (cached by Streamlit)."""
    return SentenceTransformer("all-MiniLM-L6-v2")


def get_query_embedding(query: str) -> List[float]:
    """Encode ``query`` with the cached embedder and return it as a list."""
    model = load_embedder()
    return model.encode(query).tolist()


# ================= HTML → MARKDOWN RENDER =================
def render_answer(answer: str) -> None:
    """Render StackOverflow-style HTML as Streamlit text and code widgets.

    The answer is split on ``<pre><code>...</code></pre>`` blocks. Prose
    segments have their HTML tags removed and are shown with ``st.markdown``;
    code segments are shown with ``st.code``. HTML entities are decoded in
    both, so ``&lt;`` is displayed as ``<``.

    Args:
        answer: Raw answer HTML. ``None`` or an empty string renders nothing.
    """
    # Splitting on a pattern with one capture group yields
    # [text, code, text, code, ...]: odd indices are always code bodies.
    segments = _CODE_BLOCK_RE.split(answer or "")

    for index, segment in enumerate(segments):
        if index % 2 == 0:
            # Strip tags first, then decode entities, so that an encoded
            # "&lt;tag&gt;" in the prose is shown rather than removed.
            text = html.unescape(_HTML_TAG_RE.sub("", segment))
            if text.strip():
                st.markdown(text)
        else:
            # Trim surrounding blank lines only; a full strip() would drop
            # the indentation of the first line of code.
            code = html.unescape(segment).strip("\r\n").rstrip()
            if code:
                st.code(code, language="python")  # default language


def _render_results(results) -> None:
    """Render search hits (mappings with question/answer/score keys)."""
    if not results:
        st.info("No results found.")
        return

    for position, item in enumerate(results, start=1):
        st.markdown(f"### 🔹 Result {position}")
        st.markdown(f"**{item['question']}**")
        render_answer(item["answer"])
        st.caption(f"Score: {item['score']:.4f}")
        st.divider()


def _render_message(msg) -> None:
    """Render one stored chat message inside the current chat bubble."""
    if msg["role"] != "assistant":
        st.markdown(msg["content"])
    elif msg.get("is_error"):
        st.error(msg["content"])
    elif "results" in msg:
        _render_results(msg["results"])
    else:
        # Messages that only carry raw text (no structured results).
        render_answer(msg["content"])


# ================= CHAT STATE =================
def init_chat() -> None:
    """Create the ``messages`` list in the session state if it is missing."""
    if "messages" not in st.session_state:
        st.session_state.messages = []


# ================= MAIN APP =================
def main() -> None:
    """Build the page: sidebar settings, chat history and the search input."""
    st.set_page_config(
        page_title="CodeSeek AI",
        page_icon="🔎",
        layout="wide",
    )

    init_chat()
    ensure_dataset()

    # ================= SIDEBAR =================
    with st.sidebar:
        st.title("⚙️ Settings")
        top_k = st.slider("Number of results", 1, 10, 5)

        st.markdown("---")

        if st.button("🗑️ Clear Chat"):
            st.session_state.messages = []
            st.rerun()

    # ================= HEADER =================
    st.title("🔎 CodeSeek AI")
    st.caption("Semantic Programming Search (Chat Style)")

    # ================= DISPLAY CHAT =================
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            _render_message(msg)

    # ================= INPUT =================
    user_input = st.chat_input("Ask a programming question...")
    if not user_input:
        return

    query = user_input.strip()
    if not query:
        # Nothing to embed for a whitespace-only submission.
        return

    # Store user message
    st.session_state.messages.append({
        "role": "user",
        "content": user_input,
    })
    with st.chat_message("user"):
        st.markdown(user_input)

    # Generate response; failures are reported in the same assistant bubble.
    with st.chat_message("assistant"):
        try:
            with st.spinner("🔎 Searching..."):
                engine = load_engine()
                query_embedding = get_query_embedding(query)
                # Materialise once: the hits are rendered now and replayed
                # from the session state on later reruns.
                results = list(engine.search(query_embedding, top_k=top_k))

            _render_results(results)

            # Raw text form kept alongside the structured results.
            raw_text = "".join(
                f"{item['question']}\n{item['answer']}\n" for item in results
            )
            assistant_message = {
                "role": "assistant",
                "content": raw_text or "No results found.",
                "results": results,
            }
        except Exception as e:  # UI boundary: show the failure, keep the app up
            error_text = f"Error: {e}"
            st.error(error_text)
            assistant_message = {
                "role": "assistant",
                "content": error_text,
                "is_error": True,
            }

    # Store assistant message
    st.session_state.messages.append(assistant_message)


# ================= RUN =================
if __name__ == "__main__":
    main()