from __future__ import annotations

import html
import re
import subprocess
import sys
from pathlib import Path
from typing import List

import streamlit as st
from sentence_transformers import SentenceTransformer

from search_engine import SemanticSearchEngine

# ================= CONFIG =================
DATASET_PATH = Path("data/stackoverflow_sample_3000.json")

# Opening/closing delimiters of an HTML code block. The pattern has no capture
# groups, so re.split() yields text and code parts alternately.
# NOTE(review): the delimiter pattern was unreadable in the reviewed source
# (its HTML tags had been stripped); it is reconstructed here as the
# StackOverflow <pre><code> ... </code></pre> form, tolerating attributes such
# as <pre class="lang-py">. Confirm against the dataset.
_CODE_BLOCK_RE = re.compile(
    r"<pre[^>]*>\s*<code[^>]*>|</code>\s*</pre>",
    re.IGNORECASE,
)


# ================= DATASET SETUP =================
def ensure_dataset():
    """Create the dataset file on first run.

    If ``DATASET_PATH`` does not exist, runs ``prepare_stackoverflow_sample.py``
    (located next to this file) with the current interpreter. When the script
    exits with a non-zero code, its stderr is shown and the Streamlit script
    run is stopped.
    """
    if DATASET_PATH.exists():
        return

    with st.spinner("Preparing dataset (first run only)..."):
        script = Path(__file__).parent / "prepare_stackoverflow_sample.py"
        result = subprocess.run(
            [sys.executable, str(script)],
            capture_output=True,
            text=True,
        )

    if result.returncode != 0:
        st.error(f"Dataset preparation failed:\n\n{result.stderr}")
        st.stop()


# ================= ENGINE =================
@st.cache_resource(show_spinner=False)
def load_engine() -> SemanticSearchEngine:
    """Build the search engine over ``DATASET_PATH`` (cached by Streamlit)."""
    return SemanticSearchEngine(DATASET_PATH)


# ================= EMBEDDING =================
@st.cache_resource(show_spinner=False)
def load_embedder() -> SentenceTransformer:
    """Load the ``all-MiniLM-L6-v2`` sentence-transformer (cached by Streamlit)."""
    return SentenceTransformer("all-MiniLM-L6-v2")


def get_query_embedding(query: str) -> List[float]:
    """Encode *query* with the cached embedder and return it as a plain list."""
    model = load_embedder()
    return model.encode(query).tolist()


# ================= HTML → MARKDOWN RENDER =================
def render_answer(answer: str):
    """Render StackOverflow-style HTML as Streamlit output.

    The answer is split on code-block delimiters. Even-indexed parts are
    treated as prose: simple HTML tags are removed and the rest is rendered
    as markdown (blank parts are skipped). Odd-indexed parts are treated as
    code and rendered with ``st.code``.

    Args:
        answer: HTML text of the answer. A falsy value renders nothing.
    """
    if not answer:
        return

    # Split text and code blocks
    parts = _CODE_BLOCK_RE.split(answer)

    for i, part in enumerate(parts):
        if i % 2 == 0:
            # Normal text → remove simple HTML tags
            clean_text = re.sub(r"<.*?>", "", part)
            if clean_text.strip():
                st.markdown(clean_text)
        else:
            # Code block: HTML escapes characters such as "<" and "&" inside
            # <code>, so decode entities before showing the source verbatim.
            code = html.unescape(part).strip()
            st.code(code, language="python")  # default language
# ================= CHAT STATE =================
def init_chat():
    """Make sure the session holds a ``messages`` list for the chat history."""
    if "messages" in st.session_state:
        # History already exists for this session; keep it untouched.
        return
    st.session_state.messages = []
# ================= MAIN APP =================
def main():
    """Run the CodeSeek AI chat page.

    Configures the page, initialises chat state and the dataset, draws the
    sidebar and the stored conversation, then answers a newly submitted
    question by embedding it, querying the search engine and rendering the
    results. Both the user message and the assistant response are appended
    to ``st.session_state.messages`` so they are replayed on the next rerun.
    """
    st.set_page_config(
        page_title="CodeSeek AI",
        page_icon="🔎",
        layout="wide",
    )
    init_chat()
    ensure_dataset()

    # ================= SIDEBAR =================
    with st.sidebar:
        st.title("⚙️ Settings")
        top_k = st.slider("Number of results", 1, 10, 5)
        st.markdown("---")
        if st.button("🗑️ Clear Chat"):
            st.session_state.messages = []
            st.rerun()

    # ================= HEADER =================
    st.title("🔎 CodeSeek AI")
    st.caption("Semantic Programming Search (Chat Style)")

    # ================= DISPLAY CHAT =================
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            if msg["role"] == "assistant":
                render_answer(msg["content"])
            else:
                st.markdown(msg["content"])

    # ================= INPUT =================
    user_input = st.chat_input("Ask a programming question...")
    if not user_input:
        return

    # Store user message
    st.session_state.messages.append({
        "role": "user",
        "content": user_input,
    })
    with st.chat_message("user"):
        st.markdown(user_input)

    # Generate response. The try lives inside the assistant bubble so that a
    # failure is reported in that same bubble rather than opening a second one.
    with st.chat_message("assistant"):
        try:
            with st.spinner("🔎 Searching..."):
                engine = load_engine()
                query_embedding = get_query_embedding(user_input.strip())
                results = engine.search(query_embedding, top_k=top_k)

            # Raw question/answer text kept for re-rendering the history
            history_chunks = []
            for i, item in enumerate(results, start=1):
                st.markdown(f"### 🔹 Result {i}")
                st.markdown(f"**{item['question']}**")
                render_answer(item["answer"])
                st.caption(f"Score: {item['score']:.4f}")
                st.divider()
                history_chunks.append(f"{item['question']}\n{item['answer']}\n")

            if history_chunks:
                full_response = "".join(history_chunks)
            else:
                # Without this an empty result set would store "" and replay
                # as a blank assistant bubble.
                full_response = "No results found."
                st.info(full_response)
        except Exception as e:
            # UI boundary: surface any failure to the user instead of crashing
            # the page; the message is also kept in the history.
            full_response = f"Error: {e}"
            st.error(full_response)

    # Store assistant message
    st.session_state.messages.append({
        "role": "assistant",
        "content": full_response,
    })
# ================= RUN =================
# Entry point: build the page only when this file is executed as a script.
if __name__ == "__main__":
    main()