File size: 3,403 Bytes
5924739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import json
import numpy as np
import pandas as pd
import faiss
import anthropic
import streamlit as st
from sentence_transformers import SentenceTransformer
from pathlib import Path

# Browser-tab metadata and page layout; issued ahead of any rendered element.
st.set_page_config(
    layout="centered",
    page_icon="",
    page_title="DAS / Fiber Optics Research Assistant",
)

# Visible heading, followed by a short description of what the tool answers.
st.title("DAS & Fiber Optics Research Assistant")
_intro_text = (
    "Ask questions about distributed acoustic sensing (DAS) and fiber optic "
    "technology in oilfield applications. Answers are grounded in scientific papers."
)
st.caption(_intro_text)

@st.cache_resource
def load_rag():
    """Load the retrieval resources stored under ``rag_index/``.

    Reads ``config.json`` to find the embedding model name (key
    ``"embed_model"``), then loads that SentenceTransformer model, the FAISS
    index in ``chunks.index`` and the chunk table in ``chunks.parquet``.

    Wrapped in ``st.cache_resource`` so Streamlit keeps the returned objects
    instead of reloading them on every script rerun.

    Returns:
        A ``(model, index, chunks)`` tuple: the embedding model, the FAISS
        index, and the chunk DataFrame.
    """
    index_dir = Path("rag_index")
    # JSON text is UTF-8; decode it explicitly rather than relying on the
    # platform's locale default.
    with open(index_dir / "config.json", encoding="utf-8") as f:
        cfg = json.load(f)
    model = SentenceTransformer(cfg["embed_model"])
    # faiss.read_index is given a plain string path, hence the str() conversion.
    index = faiss.read_index(str(index_dir / "chunks.index"))
    chunks = pd.read_parquet(index_dir / "chunks.parquet")
    return model, index, chunks

# Module-level handles read by retrieve(): the embedding model, the FAISS index,
# and the chunk table, in the order load_rag() returns them.
embed_model, faiss_index, df_chunks = load_rag()

def retrieve(query: str, top_k: int = 5):
    """Return the chunks most similar to ``query``, with their search scores.

    The query is embedded with the module-level ``embed_model`` (normalized,
    cast to float32) and searched against ``faiss_index``. Matching rows are
    looked up positionally in ``df_chunks``.
    NOTE(review): this assumes the row order of ``df_chunks`` matches the
    vector ids in the index; confirm against the index build script.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to return.

    Returns:
        A copy of the matching ``df_chunks`` rows, in the order FAISS returned
        them, with an added ``score`` column. It may hold fewer than ``top_k``
        rows when the index cannot supply that many neighbours.
    """
    q_emb = embed_model.encode([query], normalize_embeddings=True).astype(np.float32)
    scores, indices = faiss_index.search(q_emb, top_k)
    hit_ids, hit_scores = indices[0], scores[0]
    # FAISS fills missing neighbours with id -1. Passing -1 to iloc would
    # silently select the LAST chunk, so drop those slots first.
    found = hit_ids >= 0
    results = df_chunks.iloc[hit_ids[found]].copy()
    results["score"] = hit_scores[found]
    return results

def build_prompt(query: str, chunks_df) -> str:
    """Assemble the LLM prompt from the question and the retrieved chunks.

    Every row of ``chunks_df`` becomes one excerpt, numbered from 1 in row
    order, headed by its ``doc_title`` and ``doc_year`` and followed by its
    ``text``. The excerpts sit between EXCERPTS/END markers, preceded by the
    assistant instructions and followed by the question.
    """
    excerpts = "\n\n".join(
        f"[Source {rank}] {chunk.doc_title} ({chunk.doc_year})\n{chunk.text}"
        for rank, (_, chunk) in enumerate(chunks_df.iterrows(), start=1)
    )
    instructions = (
        "You are a technical assistant specializing in fiber optic sensing "
        "and DAS for oilfield applications. Answer using ONLY the excerpts below. "
        "Cite sources like [Source N].\n\n"
    )
    excerpt_section = f"--- EXCERPTS ---\n{excerpts}\n--- END ---\n\n"
    question_section = f"Question: {query}\n\nAnswer:"
    return instructions + excerpt_section + question_section

# Question form: the text area and slider values are submitted together when
# the "Ask" button is pressed.
with st.form("query_form"):
    query = st.text_area(
        "Your question",
        placeholder="e.g. How does DAS detect hydraulic fracture propagation?",
        height=100,
    )
    top_k = st.slider("Number of sources to retrieve", 3, 10, 5)
    submitted = st.form_submit_button("Ask", type="primary")

if submitted:
    if not query.strip():
        st.warning("Please enter a question.")
        st.stop()

    # Fail fast on a missing key, before spending time on embedding and search
    # for a request that cannot be completed.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        st.error("ANTHROPIC_API_KEY not set in Space secrets.")
        st.stop()

    with st.spinner("Searching papers and generating answer..."):
        try:
            retrieved = retrieve(query, top_k=top_k)
            prompt = build_prompt(query, retrieved)

            client = anthropic.Anthropic(api_key=api_key)
            message = client.messages.create(
                model="claude-haiku-4-5-20251001",
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            answer = message.content[0].text

        except Exception as e:
            # UI boundary: surface any retrieval/API failure to the user and
            # end this run instead of showing a traceback.
            st.error(f"Something went wrong: {e}")
            st.stop()

    st.subheader("Answer")
    st.write(answer)

    st.subheader("Retrieved sources")
    for i, (_, row) in enumerate(retrieved.iterrows(), 1):
        with st.expander(f"[Source {i}] {row.doc_title} (score: {row.score:.3f})"):
            st.write(f"**Year:** {row.doc_year}")
            # A missing URL would arrive as None/NaN rather than a string, and
            # calling .startswith on it would crash the page; only show links
            # for real string URLs.
            doc_url = row.doc_url
            if isinstance(doc_url, str) and doc_url.startswith("http"):
                st.write(f"**URL:** {doc_url}")
            st.write(row.text)