"""Streamlit front end for a retrieval-augmented DAS / fiber optics Q&A assistant.

The script embeds the user's question, looks up the nearest chunks in a
prebuilt FAISS index, asks an Anthropic model to answer using only those
chunks, and then displays the answer together with the retrieved sources.
"""

import json
import os
from pathlib import Path

import anthropic
import faiss
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer

# NOTE(review): page_icon is an empty string here; an emoji may have been lost
# somewhere upstream -- confirm the intended value.
st.set_page_config(
    page_title="DAS / Fiber Optics Research Assistant",
    page_icon="",
    layout="centered",
)

st.title("DAS & Fiber Optics Research Assistant")
st.caption(
    "Ask questions about distributed acoustic sensing (DAS) and fiber optic "
    "technology in oilfield applications. Answers are grounded in scientific papers."
)


@st.cache_resource
def load_rag():
    """Load the embedding model, the FAISS index and the chunk table.

    Everything is read from the ``rag_index`` directory: ``config.json``
    (its ``"embed_model"`` entry names the SentenceTransformer to load),
    ``chunks.index`` (FAISS index) and ``chunks.parquet`` (chunk table).

    Returns:
        A ``(model, index, chunks)`` tuple.
    """
    index_dir = Path("rag_index")
    with open(index_dir / "config.json", encoding="utf-8") as f:
        cfg = json.load(f)
    model = SentenceTransformer(cfg["embed_model"])
    index = faiss.read_index(str(index_dir / "chunks.index"))
    chunks = pd.read_parquet(index_dir / "chunks.parquet")
    return model, index, chunks


embed_model, faiss_index, df_chunks = load_rag()


def retrieve(query: str, top_k: int = 5) -> pd.DataFrame:
    """Return the chunks closest to *query*, with a ``score`` column added.

    Args:
        query: Free-text question to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A copy of the matching rows of ``df_chunks`` in the order FAISS
        returned them, with the FAISS score of each row in ``score``. Fewer
        than ``top_k`` rows are returned when the index cannot supply that
        many neighbours.
    """
    q_emb = embed_model.encode([query], normalize_embeddings=True).astype(np.float32)
    scores, indices = faiss_index.search(q_emb, top_k)

    # FAISS pads missing neighbours with label -1; passing that to .iloc would
    # silently select the last chunk, so drop padded slots first.
    found = indices[0] >= 0

    # NOTE(review): positional lookup assumes FAISS ids match the row order of
    # chunks.parquet -- confirm against the index build script.
    results = df_chunks.iloc[indices[0][found]].copy()
    results["score"] = scores[0][found]
    return results


def build_prompt(query: str, chunks_df: pd.DataFrame) -> str:
    """Build the single user prompt sent to the model.

    Args:
        query: The user's question.
        chunks_df: Retrieved chunks; the ``doc_title``, ``doc_year`` and
            ``text`` fields of each row are read.

    Returns:
        Instructions, the numbered excerpts (``[Source 1]``, ``[Source 2]``,
        ...) and the question, as one string.
    """
    parts = [
        f"[Source {i}] {row.doc_title} ({row.doc_year})\n{row.text}"
        for i, (_, row) in enumerate(chunks_df.iterrows(), 1)
    ]
    context = "\n\n".join(parts)
    return (
        "You are a technical assistant specializing in fiber optic sensing "
        "and DAS for oilfield applications. Answer using ONLY the excerpts below. "
        "Cite sources like [Source N].\n\n"
        f"--- EXCERPTS ---\n{context}\n--- END ---\n\n"
        f"Question: {query}\n\nAnswer:"
    )


with st.form("query_form"):
    query = st.text_area(
        "Your question",
        placeholder="e.g. How does DAS detect hydraulic fracture propagation?",
        height=100,
    )
    top_k = st.slider("Number of sources to retrieve", 3, 10, 5)
    submitted = st.form_submit_button("Ask", type="primary")

if submitted:
    if not query.strip():
        st.warning("Please enter a question.")
        st.stop()

    # Check configuration before doing any embedding/search work so that a
    # missing key fails fast.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        st.error("ANTHROPIC_API_KEY not set in Space secrets.")
        st.stop()

    with st.spinner("Searching papers and generating answer..."):
        # Top-level UI boundary: show any retrieval/API failure to the user
        # instead of a raw traceback.
        try:
            retrieved = retrieve(query, top_k=top_k)
            prompt = build_prompt(query, retrieved)
            client = anthropic.Anthropic(api_key=api_key)
            message = client.messages.create(
                model="claude-haiku-4-5-20251001",
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            answer = message.content[0].text
        except Exception as e:
            st.error(f"Something went wrong: {e}")
            st.stop()

    st.subheader("Answer")
    st.write(answer)

    st.subheader("Retrieved sources")
    for i, (_, row) in enumerate(retrieved.iterrows(), 1):
        with st.expander(f"[Source {i}] {row.doc_title} (score: {row.score:.3f})"):
            st.write(f"**Year:** {row.doc_year}")
            # doc_url can be None/NaN when read from parquet; only treat real
            # strings as links.
            url = row.doc_url
            if isinstance(url, str) and url.startswith("http"):
                st.write(f"**URL:** {url}")
            st.write(row.text)