cmd0160 committed on
Commit 9797603 · 1 Parent(s): 3a62773

Adding base files

.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/material_theme_project_new.xml ADDED
@@ -0,0 +1,12 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="MaterialThemeProjectNewConfig">
+     <option name="metadata">
+       <MTProjectMetadataState>
+         <option name="migrated" value="true" />
+         <option name="pristineConfig" value="false" />
+         <option name="userId" value="-3a906995:19986b060ad:-7ffc" />
+       </MTProjectMetadataState>
+     </option>
+   </component>
+ </project>
RAG_APP_README.md ADDED
@@ -0,0 +1,54 @@
+ # Abalone RAG Chatbot
+
+ This project implements a Retrieval-Augmented Generation (RAG) chatbot about Abalone using LangChain + OpenAI with a Streamlit frontend. It's designed to be deployed on Hugging Face Spaces.
+
+ Contents
+ - `app.py` - Streamlit app entrypoint
+ - `src/ingest.py` - Ingest files from `data/` into a persisted Chroma vectorstore
+ - `src/vectorstore.py` - Helpers to build/load the Chroma vectorstore and return a retriever
+ - `src/qa_chain.py` - Build the conversational retrieval QA chain
+ - `data/` - Put Abalone source files here (CSV/MD/TXT/PDF)
+ - `vectorstore/` - Persisted vectorstore directory (created by ingestion)
+
+ Quickstart (local)
+
+ 1. Create a venv and install dependencies:
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ ```
+
+ 2. Set your OpenAI API key:
+
+ ```bash
+ export OPENAI_API_KEY="sk-..."
+ ```
+
+ 3. Add Abalone files into `data/` (for example `abalone.csv`).
+
+ 4. Build the vectorstore:
+
+ ```bash
+ python -m src.ingest --data-dir ./data --persist-dir ./vectorstore
+ ```
+
+ 5. Run the Streamlit app:
+
+ ```bash
+ streamlit run app.py
+ ```
+
+ Deploying to Hugging Face Spaces
+
+ - Add `OPENAI_API_KEY` in the Spaces secrets (Settings -> Secrets); see the sketch after this file for how the app reads it.
+ - Push this repository to your HF Space. HF will install `requirements.txt` and run the Streamlit app.
+ - On first run, click the "Rebuild vectorstore (ingest)" button in the sidebar or allow the app to rebuild the index.
+
+ Security
+ - Do NOT commit your OpenAI API key. Use HF Spaces Secrets for deployment.
+
+ License
+ - MIT
+
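
Hugging Face Spaces injects secrets added under Settings -> Secrets as environment variables at runtime, so the same lookup works locally and when deployed. A minimal sketch of the equivalent of the check `app.py` performs on startup (the `RuntimeError` is illustrative; the app uses `st.error` and `st.stop`):

```python
import os

# Spaces secrets surface as environment variables, so this one lookup
# covers both local development (export OPENAI_API_KEY=...) and a
# deployed Space with the secret configured.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY not set; export it or add a Spaces secret.")
```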
app.py ADDED
@@ -0,0 +1,76 @@
+ """Streamlit app for Abalone RAG chatbot."""
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+ os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
+
+ import streamlit as st
+
+ from src.vectorstore import get_retriever
+ from src.qa_chain import make_conversational_chain
+
+
+ st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
+
+ st.title("Abalone RAG Chatbot")
+
+ if "chat_history" not in st.session_state:
+     st.session_state["chat_history"] = []
+
+ with st.sidebar:
+     st.header("Settings")
+     model_name = st.selectbox("Model", ["gpt-3.5-turbo", "gpt-4"], index=0)
+     top_k = st.number_input("Retriever top_k", min_value=1, max_value=10, value=4)
+     if st.button("Rebuild vectorstore (ingest)"):
+         st.info("Rebuild requested. Run ingestion script or push data to trigger rebuild.")
+
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ if not OPENAI_API_KEY:
+     st.error("OPENAI_API_KEY not found. Set the OPENAI_API_KEY environment variable or add it to Hugging Face Spaces Secrets.")
+     st.stop()
+
+ persist_dir = "./vectorstore"
+ retriever = None
+ try:
+     retriever = get_retriever(persist_dir=persist_dir, top_k=top_k)
+ except Exception as e:
+     st.warning("Vectorstore not found or not initialized. Please run the ingestion script to build it.\n" + str(e))
+
+
+ if retriever:
+     chain = make_conversational_chain(retriever, model_name=model_name)
+
+     user_input = st.text_input("Ask a question about Abalone", key="input")
+     if st.button("Send") and user_input:
+         with st.spinner("Thinking..."):
+             prior_history = [(h.get("question"), h.get("answer", "")) for h in st.session_state.get("chat_history", [])]
+             result = chain({"question": user_input, "chat_history": prior_history})
+             answer = result.get("answer") or result.get("output_text") or ""
+             source_docs = result.get("source_documents") or []
+             st.session_state.setdefault("chat_history", [])
+             st.session_state["chat_history"].append({"question": user_input, "answer": answer, "sources": source_docs})
+
+     if st.session_state.get("chat_history"):
+         for item in reversed(st.session_state.get("chat_history", [])):
+             st.markdown(f"**User:** {item.get('question')}")
+             st.markdown(f"**Assistant:** {item.get('answer')}")
+             sources = item.get("sources") or []
+             if sources:
+                 with st.expander("Sources"):
+                     for sd in sources:
+                         if isinstance(sd, dict):
+                             meta = sd.get("metadata", {})
+                             content_preview = sd.get("page_content") or sd.get("content") or sd.get("text", "")
+                         else:
+                             meta = getattr(sd, "metadata", {}) or {}
+                             content_preview = getattr(sd, "page_content", None)
+                             if content_preview is None:
+                                 content_preview = getattr(sd, "content", "")
+                         st.write(meta)
+                         if content_preview:
+                             try:
+                                 st.write(content_preview[:400])
+                             except Exception:
+                                 st.write(str(content_preview))
+ else:
+     st.info("No retriever available. Ingest data first.")
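
The "Sources" expander above previews retrieved chunks whether they arrive as LangChain `Document` objects or plain dicts. A minimal sketch of the object case, using `langchain.schema.Document` from the pinned langchain 0.0.354:

```python
from langchain.schema import Document

# Documents expose .page_content and .metadata; the app prints the
# metadata and the first 400 characters of the content.
doc = Document(page_content="Abalone shells are lined with nacre.",
               metadata={"source": "data/abalone.md"})
print(doc.metadata)           # {'source': 'data/abalone.md'}
print(doc.page_content[:400])
```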
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ langchain==0.0.354
+ chromadb==0.3.29
+ openai==0.27.0
+ tiktoken==0.4.0
+ pypdf==3.8.0
+ pandas==1.5.3
+ numpy==1.24.4
+ streamlit==1.20.0
+ python-dotenv==1.0.0
+ pytest==7.2.0
src/__init__.py ADDED
@@ -0,0 +1,2 @@
+ __all__ = []
+
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (170 Bytes)

src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (164 Bytes)

src/__pycache__/ingest.cpython-310.pyc ADDED
Binary file (2.68 kB)

src/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (4.95 kB)

src/__pycache__/qa_chain.cpython-310.pyc ADDED
Binary file (1.46 kB)

src/__pycache__/vectorstore.cpython-310.pyc ADDED
Binary file (1.19 kB)
src/ingest.py ADDED
@@ -0,0 +1,72 @@
+ """Ingest documents from data/ into a Chroma vectorstore using OpenAI embeddings.
+
+ Usage:
+     python -m src.ingest --data-dir ./data --persist-dir ./vectorstore
+ """
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+ os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
+
+ import argparse
+ from typing import List
+
+ from langchain.document_loaders import TextLoader, CSVLoader, PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ try:
+     from langchain_openai import OpenAIEmbeddings
+ except Exception:
+     from langchain.embeddings import OpenAIEmbeddings
+
+ from langchain.vectorstores import Chroma
+
+
+ def load_documents_from_dir(data_dir: str) -> List:
+     docs = []
+     for fname in sorted(os.listdir(data_dir)):
+         path = os.path.join(data_dir, fname)
+         if os.path.isdir(path):
+             continue
+         if fname.lower().endswith((".txt", ".md")):
+             loader = TextLoader(path, encoding="utf-8")
+             docs.extend(loader.load())
+         elif fname.lower().endswith(".csv"):
+             loader = CSVLoader(path, encoding="utf-8")
+             docs.extend(loader.load())
+         elif fname.lower().endswith(".pdf"):
+             try:
+                 loader = PyPDFLoader(path)
+                 docs.extend(loader.load())
+             except Exception:
+                 print(f"Warning: Could not load PDF {path}. Ensure pypdf is installed.")
+         else:
+             print(f"Skipping unknown file type: {path}")
+     return docs
+
+
+ def ingest(data_dir: str = "./data", persist_dir: str = "./vectorstore", chunk_size: int = 1000, chunk_overlap: int = 200):
+     assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY must be set in environment"
+     print(f"Loading documents from {data_dir}")
+     docs = load_documents_from_dir(data_dir)
+     print(f"Loaded {len(docs)} documents")
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     split_docs = text_splitter.split_documents(docs)
+     print(f"Split into {len(split_docs)} chunks")
+
+     embeddings = OpenAIEmbeddings()
+     os.makedirs(persist_dir, exist_ok=True)
+
+     db = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_dir)
+     db.persist()
+     print(f"Vectorstore persisted to {persist_dir}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--data-dir", type=str, default="./data")
+     parser.add_argument("--persist-dir", type=str, default="./vectorstore")
+     parser.add_argument("--chunk-size", type=int, default=1000)
+     parser.add_argument("--chunk-overlap", type=int, default=200)
+     args = parser.parse_args()
+     ingest(args.data_dir, args.persist_dir, args.chunk_size, args.chunk_overlap)
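
Besides the `python -m src.ingest` CLI shown in the docstring, `ingest()` can be called programmatically. A minimal sketch, assuming `OPENAI_API_KEY` is set and `./data` contains at least one supported file:

```python
from src.ingest import ingest

# Same defaults as the CLI flags; rebuilds ./vectorstore from ./data.
ingest(data_dir="./data", persist_dir="./vectorstore",
       chunk_size=1000, chunk_overlap=200)
```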
src/qa_chain.py ADDED
@@ -0,0 +1,23 @@
+ """Create a conversational retrieval QA chain using LangChain and OpenAI.
+ """
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+
+
+ def make_conversational_chain(retriever, model_name: str = "gpt-3.5-turbo", temperature: float = 0.0):
+     """Return a ConversationalRetrievalChain configured with ChatOpenAI and a ConversationBufferMemory.
+
+     The ConversationBufferMemory is configured with output_key='answer' so that when the chain
+     returns multiple outputs (for example 'answer' and 'source_documents'), the memory will pick the
+     single 'answer' field to store in the chat history.
+     """
+     assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY must be set in environment"
+     llm = ChatOpenAI(model_name=model_name, temperature=temperature)
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")
+     chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=memory, return_source_documents=True)
+     return chain
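
A minimal end-to-end sketch of the chain, assuming the vectorstore has already been built and `OPENAI_API_KEY` is set:

```python
from src.vectorstore import get_retriever
from src.qa_chain import make_conversational_chain

retriever = get_retriever(persist_dir="./vectorstore", top_k=4)
chain = make_conversational_chain(retriever, model_name="gpt-3.5-turbo")

# Legacy __call__ interface of langchain 0.0.x chains: returns a dict with
# 'answer' plus 'source_documents' (since return_source_documents=True).
result = chain({"question": "What do abalone eat?", "chat_history": []})
print(result["answer"])
```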
src/vectorstore.py ADDED
@@ -0,0 +1,26 @@
+ """Helpers to load or build a Chroma vectorstore using LangChain.
+ """
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+
+ try:
+     from langchain_openai import OpenAIEmbeddings
+ except Exception:
+     from langchain.embeddings import OpenAIEmbeddings
+
+ from langchain.vectorstores import Chroma
+
+
+ def load_vectorstore(persist_dir: str = "./vectorstore"):
+     if not os.path.exists(persist_dir):
+         raise FileNotFoundError(f"Persist directory {persist_dir} does not exist. Build the vectorstore first.")
+     embeddings = OpenAIEmbeddings()
+     db = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
+     return db
+
+
+ def get_retriever(persist_dir: str = "./vectorstore", top_k: int = 4):
+     db = load_vectorstore(persist_dir)
+     retriever = db.as_retriever(search_kwargs={"k": top_k})
+     return retriever
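
A minimal sketch of querying the retriever directly, assuming `./vectorstore` exists (`get_relevant_documents` is the retriever API in the pinned langchain 0.0.x line):

```python
from src.vectorstore import get_retriever

retriever = get_retriever(persist_dir="./vectorstore", top_k=4)
docs = retriever.get_relevant_documents("How large do abalone grow?")
for d in docs:
    print(d.metadata.get("source"), "->", d.page_content[:80])
```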
tests/__pycache__/test_imports.cpython-310-pytest-7.2.0.pyc ADDED
Binary file (1.84 kB)
 
tests/test_imports.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ import sys
+ from pathlib import Path
+
+ project_root = Path(__file__).resolve().parents[1]
+ if str(project_root) not in sys.path:
+     sys.path.insert(0, str(project_root))
+
+
+ def test_module_imports_and_symbols():
+     import importlib
+
+     modules = [
+         "src.ingest",
+         "src.vectorstore",
+         "src.qa_chain",
+     ]
+     for mod_name in modules:
+         mod = importlib.import_module(mod_name)
+         assert mod is not None
+
+     from src import ingest as ingest_mod
+     from src.vectorstore import get_retriever
+     from src.qa_chain import make_conversational_chain
+
+     assert callable(ingest_mod.ingest)
+     assert callable(get_retriever)
+     assert callable(make_conversational_chain)