Spaces:

Aynkhn
/

RAG

Sleeping

App Files Files Community

Adeen committed on Apr 25

Commit

f018f6e

0 Parent(s):

Initial deployment of Source.AI premium platform

Browse files

Files changed (7) hide show

.gitattributes +35 -0
.gitignore +14 -0
README.md +11 -0
app.py +210 -0
ingest.py +52 -0
requirements.txt +8 -0
run_app.bat +30 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+# Python Cache
+__pycache__/
+*.pyc
+# Python Packages & Virtual Environments
+venv/
+.venv/
+env/
+*.egg-info/
+build/
+dist/
+.packages/
+.env

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: RAG
+emoji: 🧠
+colorFrom: gray
+colorTo: gray
+sdk: streamlit
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import os
+import sys
+import tempfile
+import streamlit as st
+from dotenv import load_dotenv
+load_dotenv()
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.vectorstores import Chroma
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# Configuration
+# Directory handed to Chroma as its persist_directory (a relative path).
+CHROMA_DIR = "chroma_db"
+# Model name passed to HuggingFaceEmbeddings when the vector store is opened.
+EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+# Branding strings: the title is used for the page config, sidebar and main
+# header; the subtitle is rendered under both headers.
+APP_TITLE = "Source.AI"
+APP_SUBTITLE = "SOURCE TO YOUR STUDIES"
+# Custom CSS for Premium UI
+# Raw <style> block; main() injects it via st.markdown(..., unsafe_allow_html=True).
+# NOTE(review): selectors such as `.css-1offfwp` and `.sidebar .sidebar-content`
+# look tied to a particular Streamlit release - confirm they still match the
+# markup produced by the deployed version.
+PREMIUM_STYLE = """
+<style>
+    .main {
+        background-color: #0e1117;
+    }
+    .stApp {
+        background: linear-gradient(135deg, #0e1117 0%, #1a1c24 100%);
+    }
+    .sidebar .sidebar-content {
+        background-color: #1a1c24;
+    }
+    h1 {
+        color: #ffffff;
+        font-family: 'Inter', sans-serif;
+        font-weight: 700;
+        letter-spacing: -1px;
+    }
+    .stChatMessage {
+        background-color: #1e222d;
+        border-radius: 10px;
+        border: 1px solid #30363d;
+        margin-bottom: 10px;
+    }
+    .stChatInputContainer {
+        border-radius: 10px;
+        border: 1px solid #30363d;
+    }
+    .css-1offfwp {
+        background-color: #238636 !important;
+    }
+    .stButton>button {
+        width: 100%;
+        border-radius: 8px;
+        border: 1px solid #30363d;
+        background-color: #21262d;
+        color: #c9d1d9;
+        transition: all 0.2s;
+    }
+    .stButton>button:hover {
+        background-color: #30363d;
+        border-color: #8b949e;
+    }
+</style>
+"""
+# Prompt sent to the LLM. The `{context}` and `{question}` placeholders are
+# filled in with str.format() in main() just before the model is invoked.
+PROMPT_TEMPLATE = (
+    "You are a sophisticated Study Assistant. Use the provided context to answer the student's question accurately. "
+    "If the answer isn't in the context, politely state that you don't know based on the available materials. "
+    "\n\n"
+    "Context:\n{context}\n\n"
+    "Question: {question}"
+)
+@st.cache_resource
+def load_vectorstore() -> Chroma:
+    """Open the Chroma store persisted under ``CHROMA_DIR``.
+
+    The store embeds text with a ``HuggingFaceEmbeddings`` instance built from
+    ``EMBEDDING_MODEL_NAME``. The function is wrapped in ``st.cache_resource``
+    so Streamlit can reuse the returned object instead of rebuilding it.
+
+    Returns:
+        The ``Chroma`` vector store bound to ``CHROMA_DIR``.
+    """
+    return Chroma(
+        embedding_function=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME),
+        persist_directory=CHROMA_DIR,
+    )
+@st.cache_resource
+def get_llm(api_key: str) -> ChatGoogleGenerativeAI:
+    """Build the chat model client used to answer questions.
+
+    Args:
+        api_key: Google API key, forwarded as ``google_api_key``.
+
+    Returns:
+        A ``ChatGoogleGenerativeAI`` client configured for "gemini-2.0-flash".
+    """
+    # Gemini is the backend, but the UI deliberately never names it.
+    return ChatGoogleGenerativeAI(google_api_key=api_key, model="gemini-2.0-flash")
+def build_context(chunks) -> str:
+    """Merge retrieved chunks into a single context string.
+
+    Args:
+        chunks: Iterable of objects exposing a ``page_content`` attribute.
+
+    Returns:
+        Every chunk's ``page_content`` in order, separated by a blank line;
+        an empty string when ``chunks`` is empty.
+    """
+    texts = [doc.page_content for doc in chunks]
+    return "\n\n".join(texts)
+def main() -> None:
+    """Render the Streamlit app: sidebar tools, PDF indexing and the chat.
+
+    Flow, top to bottom:
+      1. Configure the page and inject ``PREMIUM_STYLE``.
+      2. Sidebar: reset button, PDF uploader, and opening of the vector store.
+         A newly uploaded PDF is written to a temp file, split into chunks and
+         added to the store.
+      3. Main pane: read ``GEMINI_API_KEY`` from the environment, build the
+         LLM, replay the stored conversation, and answer a new question from
+         the four most similar chunks.
+
+    State kept in ``st.session_state``:
+      * ``"messages"`` - list of ``{"role", "content"}`` dicts.
+      * ``"processed_files"`` - set of ``(name, size)`` keys for uploads that
+        were indexed during this session.
+
+    The function returns early (after showing a message) when the vector
+    store cannot be opened, the API key is missing, or the LLM cannot be built.
+    """
+    st.set_page_config(page_title=APP_TITLE, page_icon="📚", layout="wide")
+    st.markdown(PREMIUM_STYLE, unsafe_allow_html=True)
+    # Sidebar Header
+    with st.sidebar:
+        st.title(f"🔍 {APP_TITLE}")
+        st.markdown(f"**{APP_SUBTITLE}**")
+        st.divider()
+        # Tools
+        if st.button("🗑️ Reset Conversation"):
+            st.session_state["messages"] = []
+            st.rerun()
+        st.divider()
+        # Knowledge Base Management
+        st.subheader("📚 Knowledge Base")
+        uploaded_file = st.file_uploader("Upload course material (PDF)", type=["pdf"])
+        if "processed_files" not in st.session_state:
+            st.session_state["processed_files"] = set()
+        # Initialize vectorstore
+        try:
+            vectorstore = load_vectorstore()
+        except Exception as exc:
+            st.error(f"Engine Error: {exc}")
+            return
+        if uploaded_file is not None:
+            # Key on name *and* size: keying on the name alone would silently
+            # skip a different document that happens to reuse a file name.
+            file_key = (uploaded_file.name, uploaded_file.size)
+            if file_key not in st.session_state["processed_files"]:
+                with st.spinner("Analyzing and indexing document..."):
+                    tmp_path = None
+                    try:
+                        # PyPDFLoader takes a path, so spill the upload to disk;
+                        # delete=False keeps the file after the handle closes.
+                        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                            tmp_file.write(uploaded_file.getbuffer())
+                            tmp_path = tmp_file.name
+                        loader = PyPDFLoader(tmp_path)
+                        documents = loader.load()
+                        splitter = RecursiveCharacterTextSplitter(
+                            chunk_size=700,
+                            chunk_overlap=100,
+                        )
+                        splits = splitter.split_documents(documents)
+                        if splits:
+                            vectorstore.add_documents(splits)
+                            st.session_state["processed_files"].add(file_key)
+                            st.success("Document added to knowledge base.")
+                        else:
+                            # e.g. a scanned/image-only PDF: there is nothing to
+                            # embed, so do not call the store or claim success.
+                            st.warning("No extractable text was found in this PDF, so nothing was indexed.")
+                    except Exception as exc:
+                        st.error(f"Indexing Error: {exc}")
+                    finally:
+                        # Always remove the temp copy, even when indexing failed.
+                        if tmp_path and os.path.exists(tmp_path):
+                            os.remove(tmp_path)
+            else:
+                st.info(f"'{uploaded_file.name}' is indexed.")
+    # Main UI
+    st.title(f"🎓 {APP_TITLE}")
+    st.markdown(f"*{APP_SUBTITLE}*")
+    # Initialize messages
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = []
+    # API Key Handling
+    api_key = os.environ.get("GEMINI_API_KEY")
+    if not api_key:
+        st.warning("⚠️ Backend connection not established. Please check your configuration.")
+        return
+    try:
+        llm = get_llm(api_key)
+    except Exception as exc:
+        st.error(f"Intelligence Engine Error: {exc}")
+        return
+    # Chat Display
+    for message in st.session_state["messages"]:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    # Chat Input
+    user_input = st.chat_input("Ask anything about your studies...")
+    if user_input:
+        st.session_state["messages"].append({"role": "user", "content": user_input})
+        with st.chat_message("user"):
+            st.markdown(user_input)
+        with st.chat_message("assistant"):
+            placeholder = st.empty()
+            placeholder.markdown("🔍 Analyzing documents...")
+            try:
+                # Retrieve relevant context
+                docs = vectorstore.similarity_search(user_input, k=4)
+                if not docs:
+                    answer = "I couldn't find any relevant information in your current study materials."
+                else:
+                    context = build_context(docs)
+                    filled_prompt = PROMPT_TEMPLATE.format(context=context, question=user_input)
+                    response = llm.invoke(filled_prompt)
+                    answer = response.content
+                placeholder.markdown(answer)
+                st.session_state["messages"].append({"role": "assistant", "content": answer})
+            except Exception as exc:
+                # Failures are shown in place but not stored in the history.
+                placeholder.markdown(f"⚠️ Service interruption: {exc}")
+if __name__ == "__main__":
+    main()

ingest.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import os
+import sys
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFDirectoryLoader
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+# Directory scanned for the source PDFs.
+DATA_DIR = "data"
+# Output directory for the Chroma database. app.py opens the same directory
+# with the same embedding model name, so keep these two values in sync with it.
+CHROMA_DIR = "chroma_db"
+EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+def main() -> None:
+    """Build the Chroma database in ``CHROMA_DIR`` from the PDFs in ``DATA_DIR``.
+
+    Steps: load every PDF, split the pages into chunks, embed them with
+    ``EMBEDDING_MODEL_NAME`` and store the result on disk. Progress is printed
+    to stdout.
+
+    Exits with status 1 when ``DATA_DIR`` does not exist, and with status 0
+    when there is nothing to index (no PDFs, or PDFs without extractable text).
+    """
+    print("Starting ingestion pipeline...")
+    if not os.path.isdir(DATA_DIR):
+        print(f"Data directory '{DATA_DIR}' does not exist. Please create it and add PDFs.")
+        sys.exit(1)
+    print(f"Loading PDFs from '{DATA_DIR}'...")
+    loader = PyPDFDirectoryLoader(DATA_DIR)
+    documents = loader.load()
+    if not documents:
+        print(f"No PDF documents found in '{DATA_DIR}'. Add PDFs and run again.")
+        sys.exit(0)
+    print(f"Loaded {len(documents)} documents. Splitting into chunks...")
+    # NOTE(review): app.py splits uploads with chunk_size=700 / chunk_overlap=100;
+    # confirm the different sizes used here are intentional.
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+    )
+    splits = text_splitter.split_documents(documents)
+    if not splits:
+        # Pages loaded but produced no chunks (e.g. scanned/image-only PDFs).
+        # Stop here rather than handing Chroma an empty batch and then
+        # reporting that a database was created.
+        print(f"No extractable text found in the PDFs in '{DATA_DIR}'. Nothing to index.")
+        sys.exit(0)
+    print(f"Created {len(splits)} text chunks.")
+    print(f"Initializing embeddings model '{EMBEDDING_MODEL_NAME}'...")
+    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+    print(f"Creating Chroma database in '{CHROMA_DIR}'...")
+    vectorstore = Chroma.from_documents(
+        documents=splits,
+        embedding=embeddings,
+        persist_directory=CHROMA_DIR,
+    )
+    print("Persisting Chroma database to disk...")
+    # NOTE(review): persist() is reportedly deprecated with newer chromadb
+    # releases that write to disk automatically - confirm against the
+    # installed versions before removing this call.
+    vectorstore.persist()
+    print(f"Database successfully created and stored in '{CHROMA_DIR}'.")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+langchain
+langchain-community
+chromadb
+pypdf
+sentence-transformers
+streamlit
+langchain-google-genai
+python-dotenv

run_app.bat ADDED Viewed

	@@ -0,0 +1,30 @@

+@echo off
+REM Launcher: installs dependencies, builds the Chroma database when it is
+REM missing, then starts the Streamlit app.
+setlocal
+REM Run from the directory that contains this script.
+cd /d "%~dp0"
+REM NOTE(review): this script does not read .env itself; app.py calls
+REM load_dotenv() at import time. The message below is informational only.
+echo Loading from .env...
+echo Installing Python dependencies...
+REM The pip self-upgrade is best-effort: output is discarded and its exit code is not checked.
+python -m pip install --upgrade pip >nul 2>&1
+python -m pip install -r requirements.txt
+if errorlevel 1 (
+  echo [ERROR] Failed to install dependencies.
+  exit /b 1
+)
+REM Ingestion only runs when no chroma_db entry exists next to this script.
+if exist "chroma_db" (
+  echo Found existing Chroma database. Skipping ingestion.
+) else (
+  echo No Chroma database found. Running ingestion...
+  python ingest.py
+  if errorlevel 1 (
+    echo [ERROR] Ingestion failed.
+    exit /b 1
+  )
+)
+echo Starting Streamlit app...
+python -m streamlit run app.py
+endlocal