Spaces:
Sleeping
Sleeping
| import os | |
| import chromadb | |
| import streamlit as st | |
| from llama_index.core import VectorStoreIndex | |
| from llama_index.vector_stores.chroma import ChromaVectorStore | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.llms.gemini import Gemini | |
# Configure the browser tab title, tab icon and wide page layout.
st.set_page_config(
    page_title="Scientific Paper QA",
    page_icon="🔬",
    layout="wide",
)
# ── Sidebar ──────────────────────────────────────────────────────────────────
# Static explainer panel: what the app is, how the pipeline answers a question,
# which papers are indexed, and a link to the source repository.
with st.sidebar:
    st.title("🔬 Paper QA System")
    st.markdown(
        "A **retrieval-augmented generation (RAG)** system for querying research papers. "
        "Ask questions about the indexed research papers."
    )
    st.markdown("---")
    st.markdown("**How it works**")
    st.markdown("1. Your question is embedded into a vector")
    st.markdown("2. The 5 most similar paper chunks are retrieved")
    st.markdown("3. Gemini Flash answers using only those chunks")
    st.markdown("4. Source passages are shown with relevance scores")
    st.markdown("---")
    st.markdown("**Indexed papers**")
    st.markdown("- GraphMetaMat (Maurizi et al., 2025)")
    st.markdown("- DiffuMeta (Zheng et al., 2025)")
    st.markdown("- High-Entropy Wolframite Oxide")
    st.markdown("- Thermoelectric Wolframite Properties")
    st.markdown("- Band Diagram on Quantum ESPRESSO")
    st.markdown("---")
    st.markdown("[💻 GitHub](https://github.com/aeesh/paper-qa-system)")
# ── Example questions ────────────────────────────────────────────────────────
# Entries offered in the example dropdown. The first entry is a placeholder
# prompt rather than a real question; the remaining entries are sample queries.
_EXAMPLE_PLACEHOLDER = "Select an example question..."
_EXAMPLE_QUESTIONS = (
    "What is the main contribution of GraphMetaMat?",
    "How does DiffuMeta represent shell geometries?",
    "What is the geometric validity rate of DiffuMeta?",
    "How many training samples does GraphMetaMat use?",
    "What is the workflow for generating a band diagram in Quantum ESPRESSO?",
    "What phonon scattering mechanisms explain the ultra-low thermal conductivity of A6WO4?",
)
EXAMPLES = [_EXAMPLE_PLACEHOLDER, *_EXAMPLE_QUESTIONS]
# ── Load query engine ────────────────────────────────────────────────────────
@st.cache_resource
def load_query_engine():
    """Build the retrieval-augmented query engine used to answer questions.

    Wires together the BAAI/bge-small-en-v1.5 embedding model, the "papers"
    collection of the on-disk Chroma database at ``src/chroma_db`` and the
    Gemini 2.0 Flash LLM, and returns a query engine that retrieves the 5
    most similar chunks per question.

    The result is cached with ``st.cache_resource`` so the embedding model,
    database client and LLM wrapper are constructed once per server process
    instead of on every button click.

    Returns:
        The query engine produced by ``VectorStoreIndex.as_query_engine``.

    Raises:
        KeyError: If the ``GEMINI_API_KEY`` environment variable is not set.
    """
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

    # The path is relative to the process working directory.
    # NOTE(review): get_or_create_collection presumably returns an empty
    # collection when "papers" does not exist yet, so a wrong path would show
    # up as zero retrieved sources rather than an error; confirm in deployment.
    chroma_client = chromadb.PersistentClient(path="src/chroma_db")
    chroma_collection = chroma_client.get_or_create_collection("papers")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    llm = Gemini(
        model="models/gemini-2.0-flash",
        api_key=os.environ["GEMINI_API_KEY"],
    )

    # The index wraps the existing vector store; nothing is re-embedded here.
    index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
    return index.as_query_engine(llm=llm, similarity_top_k=5)
# ── Main UI ──────────────────────────────────────────────────────────────────
st.title("Scientific Paper QA System")
st.markdown(
    "Ask questions about the indexed research papers. "
    "Answers are grounded in the actual paper text — not generated from memory."
)
st.info("ℹ️ Deployed version uses Gemini Flash. Local version uses Llama 3.2 via Ollama.")

# Choosing an example pre-fills the text box; the first dropdown entry is only
# a placeholder and maps to an empty question.
selected = st.selectbox("Or try an example:", EXAMPLES)
example_q = "" if selected == EXAMPLES[0] else selected
question = st.text_input(
    "Enter your question:",
    value=example_q,
    placeholder="e.g. What is the main limitation of GraphMetaMat?",
)
ask_btn = st.button("Get Answer", type="primary")

if ask_btn and question.strip():
    # Both building the engine and running the query are guarded: the query is
    # the step that actually calls the LLM, so rate-limit failures surface
    # there rather than during construction.
    try:
        query_engine = load_query_engine()
        with st.spinner("Searching papers and generating answer..."):
            response = query_engine.query(question)
    except Exception as exc:  # top-level UI boundary: report, then halt this run
        detail = str(exc)
        if "429" in detail or "quota" in detail.lower():
            st.error(
                "The AI model is temporarily unavailable due to rate limits. "
                "Please try again in a few minutes or tomorrow."
            )
        else:
            st.error(f"Could not connect to Gemini: {exc}")
        # Stop here so the rendering code below never sees an undefined response.
        st.stop()

    # temporary debug — TODO: remove after fixing
    st.write("Raw response:", str(response))
    st.write("Source nodes found:", len(response.source_nodes))

    col1, col2 = st.columns([3, 2], gap="large")
    with col1:
        st.markdown("### Answer")
        st.markdown(str(response))
    with col2:
        st.markdown("### Sources")
        st.caption("Retrieved passages the answer is based on.")
        for i, node in enumerate(response.source_nodes):
            # Show at most 300 characters on one line; add an ellipsis only
            # when the passage was actually cut short.
            preview = node.text[:300].replace("\n", " ")
            if len(node.text) > 300:
                preview += "..."
            # A node may carry no score; avoid formatting None as a float.
            score_label = "n/a" if node.score is None else f"{node.score:.3f}"
            with st.expander(
                f"{node.metadata.get('file_name', 'unknown')} — relevance: {score_label}",
                expanded=(i == 0),  # only the top hit starts expanded
            ):
                st.markdown(f"*{preview}*")

st.markdown("---")
st.caption(
    "Stack: LlamaIndex · Gemini Flash (deployed) / Llama 3.2 3B via Ollama (local) · "
    "BAAI/bge-small-en-v1.5 embeddings · ChromaDB · "
    "Evaluated on 34 domain-specific questions at top-k=5"
)