Adeen committed on
Commit
f018f6e
·
0 Parent(s):

Initial deployment of Source.AI premium platform

Browse files
Files changed (7) hide show
  1. .gitattributes +35 -0
  2. .gitignore +14 -0
  3. README.md +11 -0
  4. app.py +210 -0
  5. ingest.py +52 -0
  6. requirements.txt +8 -0
  7. run_app.bat +30 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Python Cache
3
+ __pycache__/
4
+ *.pyc
5
+
6
+ # Python Packages & Virtual Environments
7
+ venv/
8
+ .venv/
9
+ env/
10
+ *.egg-info/
11
+ build/
12
+ dist/
13
+ .packages/
14
+ .env
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RAG
3
+ emoji: 🧠
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ pinned: false
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tempfile
4
+ import streamlit as st
5
+
6
+ from dotenv import load_dotenv
7
+ load_dotenv()
8
+
9
+ from langchain_community.embeddings import HuggingFaceEmbeddings
10
+ from langchain_community.document_loaders import PyPDFLoader
11
+ from langchain_community.vectorstores import Chroma
12
+ from langchain_google_genai import ChatGoogleGenerativeAI
13
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
14
+
15
# Configuration
# Directory passed to Chroma as its persist_directory.
CHROMA_DIR = "chroma_db"
# Model name handed to HuggingFaceEmbeddings to build the embedding function.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Product name: used as the browser page title and in the sidebar/main headings.
APP_TITLE = "Source.AI"
# Tagline rendered under the sidebar heading and the main heading.
APP_SUBTITLE = "SOURCE TO YOUR STUDIES"
20
+
21
# Custom CSS for Premium UI
# Injected once per run by main() through st.markdown(..., unsafe_allow_html=True).
# NOTE(review): `.css-1offfwp` looks like a generated class name and
# `.sidebar .sidebar-content` may not match the markup of the installed
# Streamlit version — confirm these selectors still apply.
PREMIUM_STYLE = """
<style>
.main {
background-color: #0e1117;
}
.stApp {
background: linear-gradient(135deg, #0e1117 0%, #1a1c24 100%);
}
.sidebar .sidebar-content {
background-color: #1a1c24;
}
h1 {
color: #ffffff;
font-family: 'Inter', sans-serif;
font-weight: 700;
letter-spacing: -1px;
}
.stChatMessage {
background-color: #1e222d;
border-radius: 10px;
border: 1px solid #30363d;
margin-bottom: 10px;
}
.stChatInputContainer {
border-radius: 10px;
border: 1px solid #30363d;
}
.css-1offfwp {
background-color: #238636 !important;
}
.stButton>button {
width: 100%;
border-radius: 8px;
border: 1px solid #30363d;
background-color: #21262d;
color: #c9d1d9;
transition: all 0.2s;
}
.stButton>button:hover {
background-color: #30363d;
border-color: #8b949e;
}
</style>
"""
66
+
67
# Prompt sent to the LLM. `{context}` and `{question}` are filled in with
# str.format() just before the model is invoked.
PROMPT_TEMPLATE = "".join(
    [
        "You are a sophisticated Study Assistant. ",
        "Use the provided context to answer the student's question accurately. ",
        "If the answer isn't in the context, politely state that you don't know ",
        "based on the available materials. ",
        "\n\nContext:\n{context}",
        "\n\nQuestion: {question}",
    ]
)
74
+
75
@st.cache_resource
def load_vectorstore() -> Chroma:
    """Open the Chroma store under ``CHROMA_DIR`` with the configured embeddings.

    The embedding function is a ``HuggingFaceEmbeddings`` instance built from
    ``EMBEDDING_MODEL_NAME``. The function is decorated with
    ``st.cache_resource`` so the store is not rebuilt on every script rerun.

    Returns:
        The ``Chroma`` vector store bound to ``CHROMA_DIR``.
    """
    return Chroma(
        embedding_function=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME),
        persist_directory=CHROMA_DIR,
    )
83
+
84
@st.cache_resource
def get_llm(api_key: str) -> ChatGoogleGenerativeAI:
    """Create the chat model client used to answer questions.

    Args:
        api_key: Google API key forwarded to ``ChatGoogleGenerativeAI``.

    Returns:
        A ``ChatGoogleGenerativeAI`` client for the ``gemini-2.0-flash`` model.
    """
    # Gemini is the backend, but the UI intentionally does not brand it.
    return ChatGoogleGenerativeAI(google_api_key=api_key, model="gemini-2.0-flash")
89
+
90
def build_context(chunks) -> str:
    """Join the ``page_content`` of every chunk, separated by a blank line.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The concatenated text; an empty string when ``chunks`` is empty.
    """
    texts = [doc.page_content for doc in chunks]
    separator = "\n\n"
    return separator.join(texts)
92
+
93
def _index_uploaded_pdf(vectorstore, uploaded_file) -> None:
    """Chunk one uploaded PDF and add the chunks to the vector store.

    The upload is first written to a temporary ``.pdf`` file because
    ``PyPDFLoader`` is constructed from a path. The outcome (success, empty
    document, or failure) is reported through Streamlit status widgets; the
    temporary file is removed in every case.

    Args:
        vectorstore: Store whose ``add_documents`` receives the chunks.
        uploaded_file: Streamlit upload exposing ``name`` and ``getbuffer()``.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write still gets
            # cleaned up in the ``finally`` clause below.
            tmp_path = tmp_file.name
            tmp_file.write(uploaded_file.getbuffer())

        documents = PyPDFLoader(tmp_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100)
        splits = splitter.split_documents(documents)

        if not splits:
            # A PDF without extractable text (e.g. scanned pages) yields no
            # chunks. Report that explicitly instead of handing the store an
            # empty batch. The file is deliberately not marked as processed,
            # so it is never reported as "indexed".
            st.warning("No extractable text was found in this PDF, so nothing was indexed.")
            return

        vectorstore.add_documents(splits)
        st.session_state["processed_files"].add(uploaded_file.name)
        st.success("Document added to knowledge base.")
    except Exception as exc:
        # UI boundary: surface the failure to the user instead of crashing the app.
        st.error(f"Indexing Error: {exc}")
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)


def main() -> None:
    """Render the Streamlit page: sidebar tools, PDF indexing and the chat loop.

    Returns early (after showing a message) when the vector store cannot be
    opened, when ``GEMINI_API_KEY`` is not set, or when the LLM client cannot
    be created.
    """
    st.set_page_config(page_title=APP_TITLE, page_icon="📚", layout="wide")
    st.markdown(PREMIUM_STYLE, unsafe_allow_html=True)

    # Sidebar Header
    # NOTE(review): the extent of this ``with`` block was reconstructed from a
    # flattened diff; everything up to "Main UI" is assumed to live in the
    # sidebar — confirm against the real file.
    with st.sidebar:
        st.title(f"🔍 {APP_TITLE}")
        st.markdown(f"**{APP_SUBTITLE}**")
        st.divider()

        # Tools
        if st.button("🗑️ Reset Conversation"):
            st.session_state["messages"] = []
            st.rerun()

        st.divider()

        # Knowledge Base Management
        st.subheader("📚 Knowledge Base")
        uploaded_file = st.file_uploader("Upload course material (PDF)", type=["pdf"])

        # Names of files already indexed during this browser session.
        if "processed_files" not in st.session_state:
            st.session_state["processed_files"] = set()

        # Initialize vectorstore
        try:
            vectorstore = load_vectorstore()
        except Exception as exc:
            st.error(f"Engine Error: {exc}")
            return

        if uploaded_file is not None:
            if uploaded_file.name in st.session_state["processed_files"]:
                st.info(f"'{uploaded_file.name}' is indexed.")
            else:
                with st.spinner("Analyzing and indexing document..."):
                    _index_uploaded_pdf(vectorstore, uploaded_file)

    # Main UI
    st.title(f"🎓 {APP_TITLE}")
    st.markdown(f"*{APP_SUBTITLE}*")

    # Initialize messages
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # API Key Handling
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        st.warning("⚠️ Backend connection not established. Please check your configuration.")
        return

    try:
        llm = get_llm(api_key)
    except Exception as exc:
        st.error(f"Intelligence Engine Error: {exc}")
        return

    # Chat Display: replay the stored conversation on every rerun.
    for message in st.session_state["messages"]:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat Input
    user_input = st.chat_input("Ask anything about your studies...")

    if user_input:
        st.session_state["messages"].append({"role": "user", "content": user_input})
        with st.chat_message("user"):
            st.markdown(user_input)

        with st.chat_message("assistant"):
            # Placeholder is overwritten with the answer (or the error) below.
            placeholder = st.empty()
            placeholder.markdown("🔍 Analyzing documents...")

            try:
                # Retrieve relevant context
                docs = vectorstore.similarity_search(user_input, k=4)

                if not docs:
                    answer = "I couldn't find any relevant information in your current study materials."
                else:
                    context = build_context(docs)
                    filled_prompt = PROMPT_TEMPLATE.format(context=context, question=user_input)

                    response = llm.invoke(filled_prompt)
                    answer = response.content

                placeholder.markdown(answer)
                st.session_state["messages"].append({"role": "assistant", "content": answer})

            except Exception as exc:
                placeholder.markdown(f"⚠️ Service interruption: {exc}")


if __name__ == "__main__":
    main()
ingest.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
5
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import Chroma
8
+
9
# Directory scanned for PDF files by PyPDFDirectoryLoader.
DATA_DIR = "data"
# Directory passed to Chroma as its persist_directory.
CHROMA_DIR = "chroma_db"
# Model name handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
12
+
13
def main() -> None:
    """Build the Chroma database in ``CHROMA_DIR`` from the PDFs in ``DATA_DIR``.

    Progress is printed to stdout. The process exits with status 1 when
    ``DATA_DIR`` does not exist, and with status 0 when there is nothing to
    index (no PDF documents loaded, or no text chunks produced from them).
    """
    print("Starting ingestion pipeline...")

    if not os.path.isdir(DATA_DIR):
        print(f"Data directory '{DATA_DIR}' does not exist. Please create it and add PDFs.")
        sys.exit(1)

    print(f"Loading PDFs from '{DATA_DIR}'...")
    loader = PyPDFDirectoryLoader(DATA_DIR)
    documents = loader.load()

    if not documents:
        print(f"No PDF documents found in '{DATA_DIR}'. Add PDFs and run again.")
        sys.exit(0)

    print(f"Loaded {len(documents)} documents. Splitting into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    splits = text_splitter.split_documents(documents)
    print(f"Created {len(splits)} text chunks.")

    if not splits:
        # PDFs without extractable text produce no chunks; stop here rather
        # than asking the vector store to index an empty batch.
        print(f"No text could be extracted from the PDFs in '{DATA_DIR}'. Nothing to index.")
        sys.exit(0)

    print(f"Initializing embeddings model '{EMBEDDING_MODEL_NAME}'...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    print(f"Creating Chroma database in '{CHROMA_DIR}'...")
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )

    # Manual persistence is a deprecated step in the Chroma integration and
    # the dependency versions are unpinned, so only call persist() when the
    # installed wrapper still provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        print("Persisting Chroma database to disk...")
        persist()

    print(f"Database successfully created and stored in '{CHROMA_DIR}'.")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-community
3
+ chromadb
4
+ pypdf
5
+ sentence-transformers
6
+ streamlit
7
+ langchain-google-genai
8
+ python-dotenv
run_app.bat ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
setlocal

REM Work from the folder this script lives in.
cd /d "%~dp0"

REM NOTE: this script only prints the message below; it does not read .env itself.
echo Loading from .env...

REM Dependencies: output of the pip self-upgrade is discarded; only the
REM requirements install is checked for failure.
echo Installing Python dependencies...
python -m pip install --upgrade pip >nul 2>&1
python -m pip install -r requirements.txt
if errorlevel 1 goto deps_failed

REM Run ingestion only when no "chroma_db" entry exists yet.
if exist "chroma_db" goto have_db
echo No Chroma database found. Running ingestion...
python ingest.py
if errorlevel 1 goto ingest_failed
goto start_app

:have_db
echo Found existing Chroma database. Skipping ingestion.

:start_app
echo Starting Streamlit app...
python -m streamlit run app.py

endlocal
goto :eof

:deps_failed
echo [ERROR] Failed to install dependencies.
exit /b 1

:ingest_failed
echo [ERROR] Ingestion failed.
exit /b 1