cmd0160 committed on
Commit 9797603 · 1 Parent(s): 3a62773

Adding base files

.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/material_theme_project_new.xml ADDED
@@ -0,0 +1,12 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="MaterialThemeProjectNewConfig">
+     <option name="metadata">
+       <MTProjectMetadataState>
+         <option name="migrated" value="true" />
+         <option name="pristineConfig" value="false" />
+         <option name="userId" value="-3a906995:19986b060ad:-7ffc" />
+       </MTProjectMetadataState>
+     </option>
+   </component>
+ </project>
RAG_APP_README.md ADDED
@@ -0,0 +1,54 @@
+ # Abalone RAG Chatbot
+
+ This project implements a Retrieval-Augmented Generation (RAG) chatbot about Abalone using LangChain + OpenAI with a Streamlit frontend. It's designed to be deployed on Hugging Face Spaces.
+
+ Contents
+ - `app.py` - Streamlit app entrypoint
+ - `src/ingest.py` - Ingest files from `data/` into a persisted Chroma vectorstore
+ - `src/vectorstore.py` - Helpers to build/load the Chroma vectorstore and return a retriever
+ - `src/qa_chain.py` - Build the conversational retrieval QA chain
+ - `data/` - Put Abalone source files here (CSV/MD/TXT/PDF)
+ - `vectorstore/` - Persisted vectorstore directory (created by ingestion)
+
+ Quickstart (local)
+
+ 1. Create a venv and install dependencies:
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ ```
+
+ 2. Set your OpenAI API key:
+
+ ```bash
+ export OPENAI_API_KEY="sk-..."
+ ```
+
+ 3. Add Abalone files into `data/` (for example `abalone.csv`).
+
+ 4. Build the vectorstore:
+
+ ```bash
+ python -m src.ingest --data-dir ./data --persist-dir ./vectorstore
+ ```
+
+ 5. Run the Streamlit app:
+
+ ```bash
+ streamlit run app.py
+ ```
+
+ Deploying to Hugging Face Spaces
+
+ - Add `OPENAI_API_KEY` in the Spaces secrets (Settings -> Secrets); see the sketch after this file for how the app reads it.
+ - Push this repository to your HF Space. HF will install `requirements.txt` and run the Streamlit app.
+ - On first run, click the "Rebuild vectorstore (ingest)" button in the sidebar or allow the app to rebuild the index.
+
+ Security
+ - Do NOT commit your OpenAI API key. Use HF Spaces Secrets for deployment.
+
+ License
+ - MIT
+
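
Hugging Face Spaces injects secrets added under Settings -> Secrets as environment variables at runtime, so the same lookup works locally and when deployed. A minimal sketch of the equivalent of the check `app.py` performs on startup (the `RuntimeError` is illustrative; the app uses `st.error` and `st.stop`):

```python
import os

# Spaces secrets surface as environment variables, so this one lookup
# covers both local development (export OPENAI_API_KEY=...) and a
# deployed Space with the secret configured.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY not set; export it or add a Spaces secret.")
```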
app.py ADDED
@@ -0,0 +1,76 @@
+ """Streamlit app for Abalone RAG chatbot."""
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+ os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
+
+ import streamlit as st
+
+ from src.vectorstore import get_retriever
+ from src.qa_chain import make_conversational_chain
+
+
+ st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
+
+ st.title("Abalone RAG Chatbot")
+
+ if "chat_history" not in st.session_state:
+     st.session_state["chat_history"] = []
+
+ with st.sidebar:
+     st.header("Settings")
+     model_name = st.selectbox("Model", ["gpt-3.5-turbo", "gpt-4"], index=0)
+     top_k = st.number_input("Retriever top_k", min_value=1, max_value=10, value=4)
+     if st.button("Rebuild vectorstore (ingest)"):
+         st.info("Rebuild requested. Run ingestion script or push data to trigger rebuild.")
+
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ if not OPENAI_API_KEY:
+     st.error("OPENAI_API_KEY not found. Set the OPENAI_API_KEY environment variable or add it to Hugging Face Spaces Secrets.")
+     st.stop()
+
+ persist_dir = "./vectorstore"
+ retriever = None
+ try:
+     retriever = get_retriever(persist_dir=persist_dir, top_k=top_k)
+ except Exception as e:
+     st.warning("Vectorstore not found or not initialized. Please run the ingestion script to build it.\n" + str(e))
+
+
+ if retriever:
+     chain = make_conversational_chain(retriever, model_name=model_name)
+
+     user_input = st.text_input("Ask a question about Abalone", key="input")
+     if st.button("Send") and user_input:
+         with st.spinner("Thinking..."):
+             prior_history = [(h.get("question"), h.get("answer", "")) for h in st.session_state.get("chat_history", [])]
+             result = chain({"question": user_input, "chat_history": prior_history})
+             answer = result.get("answer") or result.get("output_text") or ""
+             source_docs = result.get("source_documents") or []
+             st.session_state.setdefault("chat_history", [])
+             st.session_state["chat_history"].append({"question": user_input, "answer": answer, "sources": source_docs})
+
+     if st.session_state.get("chat_history"):
+         for item in reversed(st.session_state.get("chat_history", [])):
+             st.markdown(f"**User:** {item.get('question')}")
+             st.markdown(f"**Assistant:** {item.get('answer')}")
+             sources = item.get("sources") or []
+             if sources:
+                 with st.expander("Sources"):
+                     for sd in sources:
+                         if isinstance(sd, dict):
+                             meta = sd.get("metadata", {})
+                             content_preview = sd.get("page_content") or sd.get("content") or sd.get("text", "")
+                         else:
+                             meta = getattr(sd, "metadata", {}) or {}
+                             content_preview = getattr(sd, "page_content", None)
+                             if content_preview is None:
+                                 content_preview = getattr(sd, "content", "")
+                         st.write(meta)
+                         if content_preview:
+                             try:
+                                 st.write(content_preview[:400])
+                             except Exception:
+                                 st.write(str(content_preview))
+ else:
+     st.info("No retriever available. Ingest data first.")
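
The "Sources" expander above previews retrieved chunks whether they arrive as LangChain `Document` objects or plain dicts. A minimal sketch of the object case, using `langchain.schema.Document` from the pinned langchain 0.0.354:

```python
from langchain.schema import Document

# Documents expose .page_content and .metadata; the app prints the
# metadata and the first 400 characters of the content.
doc = Document(page_content="Abalone shells are lined with nacre.",
               metadata={"source": "data/abalone.md"})
print(doc.metadata)           # {'source': 'data/abalone.md'}
print(doc.page_content[:400])
```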
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ langchain==0.0.354
+ chromadb==0.3.29
+ openai==0.27.0
+ tiktoken==0.4.0
+ pypdf==3.8.0
+ pandas==1.5.3
+ numpy==1.24.4
+ streamlit==1.20.0
+ python-dotenv==1.0.0
+ pytest==7.2.0
src/__init__.py ADDED
@@ -0,0 +1,2 @@
+ __all__ = []
+
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (170 Bytes)

src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (164 Bytes)

src/__pycache__/ingest.cpython-310.pyc ADDED
Binary file (2.68 kB)

src/__pycache__/ingest.cpython-313.pyc ADDED
Binary file (4.95 kB)

src/__pycache__/qa_chain.cpython-310.pyc ADDED
Binary file (1.46 kB)

src/__pycache__/vectorstore.cpython-310.pyc ADDED
Binary file (1.19 kB)
src/ingest.py ADDED
@@ -0,0 +1,72 @@
+ """Ingest documents from data/ into a Chroma vectorstore using OpenAI embeddings.
+
+ Usage:
+     python -m src.ingest --data-dir ./data --persist-dir ./vectorstore
+ """
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+ os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
+
+ import argparse
+ from typing import List
+
+ from langchain.document_loaders import TextLoader, CSVLoader, PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ try:
+     from langchain_openai import OpenAIEmbeddings
+ except Exception:
+     from langchain.embeddings import OpenAIEmbeddings
+
+ from langchain.vectorstores import Chroma
+
+
+ def load_documents_from_dir(data_dir: str) -> List:
+     docs = []
+     for fname in sorted(os.listdir(data_dir)):
+         path = os.path.join(data_dir, fname)
+         if os.path.isdir(path):
+             continue
+         if fname.lower().endswith((".txt", ".md")):
+             loader = TextLoader(path, encoding="utf-8")
+             docs.extend(loader.load())
+         elif fname.lower().endswith(".csv"):
+             loader = CSVLoader(path, encoding="utf-8")
+             docs.extend(loader.load())
+         elif fname.lower().endswith(".pdf"):
+             try:
+                 loader = PyPDFLoader(path)
+                 docs.extend(loader.load())
+             except Exception:
+                 print(f"Warning: Could not load PDF {path}. Ensure pypdf is installed.")
+         else:
+             print(f"Skipping unknown file type: {path}")
+     return docs
+
+
+ def ingest(data_dir: str = "./data", persist_dir: str = "./vectorstore", chunk_size: int = 1000, chunk_overlap: int = 200):
+     assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY must be set in environment"
+     print(f"Loading documents from {data_dir}")
+     docs = load_documents_from_dir(data_dir)
+     print(f"Loaded {len(docs)} documents")
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     split_docs = text_splitter.split_documents(docs)
+     print(f"Split into {len(split_docs)} chunks")
+
+     embeddings = OpenAIEmbeddings()
+     os.makedirs(persist_dir, exist_ok=True)
+
+     db = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_dir)
+     db.persist()
+     print(f"Vectorstore persisted to {persist_dir}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--data-dir", type=str, default="./data")
+     parser.add_argument("--persist-dir", type=str, default="./vectorstore")
+     parser.add_argument("--chunk-size", type=int, default=1000)
+     parser.add_argument("--chunk-overlap", type=int, default=200)
+     args = parser.parse_args()
+     ingest(args.data_dir, args.persist_dir, args.chunk_size, args.chunk_overlap)
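
Besides the `python -m src.ingest` CLI shown in the docstring, `ingest()` can be called programmatically. A minimal sketch, assuming `OPENAI_API_KEY` is set and `./data` contains at least one supported file:

```python
from src.ingest import ingest

# Same defaults as the CLI flags; rebuilds ./vectorstore from ./data.
ingest(data_dir="./data", persist_dir="./vectorstore",
       chunk_size=1000, chunk_overlap=200)
```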
src/qa_chain.py ADDED
@@ -0,0 +1,23 @@
+ """Create a conversational retrieval QA chain using LangChain and OpenAI.
+ """
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+
+
+ def make_conversational_chain(retriever, model_name: str = "gpt-3.5-turbo", temperature: float = 0.0):
+     """Return a ConversationalRetrievalChain configured with ChatOpenAI and a ConversationBufferMemory.
+
+     The ConversationBufferMemory is configured with output_key='answer' so that when the chain
+     returns multiple outputs (for example 'answer' and 'source_documents'), the memory will pick the
+     single 'answer' field to store in the chat history.
+     """
+     assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY must be set in environment"
+     llm = ChatOpenAI(model_name=model_name, temperature=temperature)
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")
+     chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=memory, return_source_documents=True)
+     return chain
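
A minimal end-to-end sketch of the chain, assuming the vectorstore has already been built and `OPENAI_API_KEY` is set:

```python
from src.vectorstore import get_retriever
from src.qa_chain import make_conversational_chain

retriever = get_retriever(persist_dir="./vectorstore", top_k=4)
chain = make_conversational_chain(retriever, model_name="gpt-3.5-turbo")

# Legacy __call__ interface of langchain 0.0.x chains: returns a dict with
# 'answer' plus 'source_documents' (since return_source_documents=True).
result = chain({"question": "What do abalone eat?", "chat_history": []})
print(result["answer"])
```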
src/vectorstore.py ADDED
@@ -0,0 +1,26 @@
+ """Helpers to load or build a Chroma vectorstore using LangChain.
+ """
+ import os
+ os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
+ os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
+
+ try:
+     from langchain_openai import OpenAIEmbeddings
+ except Exception:
+     from langchain.embeddings import OpenAIEmbeddings
+
+ from langchain.vectorstores import Chroma
+
+
+ def load_vectorstore(persist_dir: str = "./vectorstore"):
+     if not os.path.exists(persist_dir):
+         raise FileNotFoundError(f"Persist directory {persist_dir} does not exist. Build the vectorstore first.")
+     embeddings = OpenAIEmbeddings()
+     db = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
+     return db
+
+
+ def get_retriever(persist_dir: str = "./vectorstore", top_k: int = 4):
+     db = load_vectorstore(persist_dir)
+     retriever = db.as_retriever(search_kwargs={"k": top_k})
+     return retriever
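
A minimal sketch of querying the retriever directly, assuming `./vectorstore` exists (`get_relevant_documents` is the retriever API in the pinned langchain 0.0.x line):

```python
from src.vectorstore import get_retriever

retriever = get_retriever(persist_dir="./vectorstore", top_k=4)
docs = retriever.get_relevant_documents("How large do abalone grow?")
for d in docs:
    print(d.metadata.get("source"), "->", d.page_content[:80])
```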
tests/__pycache__/test_imports.cpython-310-pytest-7.2.0.pyc ADDED
Binary file (1.84 kB)
 
tests/test_imports.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ import sys
+ from pathlib import Path
+
+ project_root = Path(__file__).resolve().parents[1]
+ if str(project_root) not in sys.path:
+     sys.path.insert(0, str(project_root))
+
+
+ def test_module_imports_and_symbols():
+     import importlib
+
+     modules = [
+         "src.ingest",
+         "src.vectorstore",
+         "src.qa_chain",
+     ]
+     for mod_name in modules:
+         mod = importlib.import_module(mod_name)
+         assert mod is not None
+
+     from src import ingest as ingest_mod
+     from src.vectorstore import get_retriever
+     from src.qa_chain import make_conversational_chain
+
+     assert callable(ingest_mod.ingest)
+     assert callable(get_retriever)
+     assert callable(make_conversational_chain)