msaid1976 committed on
Commit
fae76c4
·
verified ·
1 Parent(s): 41e5012

Sync from GitHub via hub-sync

Browse files
Files changed (5) hide show
  1. Dockerfile +0 -20
  2. README.md +0 -20
  3. app.py +238 -0
  4. requirements.txt +15 -3
  5. src/streamlit_app.py +0 -40
Dockerfile DELETED
@@ -1,20 +0,0 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,20 +0,0 @@
1
- ---
2
- title: Search Engine LLM
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- license: apache-2.0
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+
4
+ import streamlit as st
5
+ from dotenv import load_dotenv
6
+ from langchain.agents import create_agent
7
+ from langchain_community.callbacks.streamlit import StreamlitCallbackHandler
8
+ from langchain_community.tools import ArxivQueryRun, DuckDuckGoSearchRun, WikipediaQueryRun
9
+ from langchain_community.utilities import ArxivAPIWrapper, WikipediaAPIWrapper
10
+ from langchain_chroma import Chroma
11
+ from langchain_core.documents import Document
12
+ from langchain_core.tools import tool
13
+ from langchain_groq import ChatGroq
14
+ from langchain_huggingface import HuggingFaceEmbeddings
15
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
16
+ from pypdf import PdfReader
17
+
18
+
19
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Re-export HF_TOKEN when it is set; the value written is the one just read.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
23
+
24
+
25
@st.cache_resource
def get_embeddings():
    """Return the embedding model used to index uploaded documents.

    The function is wrapped in ``st.cache_resource`` so the model object is
    handed out from Streamlit's resource cache instead of being rebuilt on
    each call.
    """
    model_name = "all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
28
+
29
+
30
def build_retriever(uploaded_files):
    """Index the uploaded files and return a retriever over their text.

    PDFs (detected by MIME type or a ``.pdf`` suffix) are read page by page
    and each non-blank page becomes one document tagged with its 1-based page
    number. Every other file is decoded as UTF-8, dropping undecodable bytes,
    and becomes a single document. The documents are split into overlapping
    chunks, embedded with ``get_embeddings()`` and stored in a Chroma
    collection created for this call.

    Args:
        uploaded_files: Iterable of upload objects exposing ``name``, ``type``
            and ``getvalue()`` and readable as a binary stream by
            ``PdfReader`` (the caller passes ``st.file_uploader`` results).

    Returns:
        A retriever configured with ``k=4``, or ``None`` when none of the
        files produced any non-blank text.
    """
    # Local import keeps this block self-contained; uuid is standard library.
    import uuid

    documents = []

    for uploaded_file in uploaded_files:
        is_pdf = (
            uploaded_file.type == "application/pdf"
            or uploaded_file.name.lower().endswith(".pdf")
        )

        if is_pdf:
            reader = PdfReader(uploaded_file)
            for page_number, page in enumerate(reader.pages, start=1):
                # extract_text() may yield nothing for image-only pages.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    documents.append(
                        Document(
                            page_content=page_text,
                            metadata={"source": uploaded_file.name, "page": page_number},
                        )
                    )
        else:
            # Only text files need the raw bytes, so fetch them here rather
            # than copying every upload up front.
            text = uploaded_file.getvalue().decode("utf-8", errors="ignore")
            if text.strip():
                documents.append(
                    Document(
                        page_content=text,
                        metadata={"source": uploaded_file.name},
                    )
                )

    if not documents:
        return None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)

    # Give every index its own collection. Without an explicit name each
    # in-memory Chroma store created in this process uses the same default
    # collection, so chunks from earlier uploads (or other users' sessions)
    # would keep surfacing in search results.
    # NOTE(review): superseded collections are not deleted here; consider
    # cleaning them up if uploads change often.
    collection_name = f"uploads-{uuid.uuid4().hex}"
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=get_embeddings(),
        collection_name=collection_name,
    )
    return vectorstore.as_retriever(search_kwargs={"k": 4})
64
+
65
+
66
def uploaded_files_signature(uploaded_files):
    """Return a SHA-256 hex digest identifying the current set of uploads.

    The digest covers each file's name and content in upload order, so it
    changes when a file is added, removed, renamed, reordered or edited. The
    caller compares it with the previous digest to decide whether the
    documents need re-indexing.

    Args:
        uploaded_files: Iterable of objects exposing ``name`` (str) and
            ``getvalue()`` (bytes).

    Returns:
        A 64-character hexadecimal string.
    """
    digest = hashlib.sha256()
    for uploaded_file in uploaded_files:
        fields = (uploaded_file.name.encode("utf-8"), uploaded_file.getvalue())
        for field in fields:
            # Length-prefix every field so the name/content boundary is part
            # of the hash; otherwise ("ab", b"c") and ("a", b"bc") collide.
            digest.update(len(field).to_bytes(8, "big"))
            digest.update(field)
    return digest.hexdigest()
72
+
73
+
74
def create_documents_tool(retriever):
    """Build the ``uploaded_documents`` agent tool around ``retriever``.

    The returned tool forwards the query to ``retriever.invoke`` and renders
    each hit as a ``Source: ...`` header followed by the chunk text, with
    hits separated by blank lines.
    """

    def _render(doc):
        # Page numbers exist only for chunks that came from PDFs.
        origin = doc.metadata.get("source", "uploaded document")
        page_no = doc.metadata.get("page")
        heading = f"{origin}, page {page_no}" if page_no else origin
        return f"Source: {heading}\n{doc.page_content}"

    @tool("uploaded_documents")
    def uploaded_documents(query: str) -> str:
        """Search the uploaded documents for information relevant to the user's question."""
        matches = retriever.invoke(query)
        if matches:
            return "\n\n".join(_render(match) for match in matches)
        return "No relevant uploaded document content was found."

    return uploaded_documents
92
+
93
+
94
# Built-in lookup tools, created once at import time and enabled per rerun
# from the sidebar checkboxes.

# arXiv: a single result, content capped via doc_content_chars_max=200.
arxiv_wrapper = ArxivAPIWrapper(
    top_k_results=1,
    doc_content_chars_max=200,
)
arxiv = ArxivQueryRun(api_wrapper=arxiv_wrapper)

# Wikipedia: same limits as the arXiv wrapper.
wiki_wrapper = WikipediaAPIWrapper(
    top_k_results=1,
    doc_content_chars_max=200,
)
wiki = WikipediaQueryRun(api_wrapper=wiki_wrapper)

# General web search, exposed to the agent under the tool name "Search".
search = DuckDuckGoSearchRun(name="Search")
101
+
102
+
103
def default_messages():
    """Return a fresh chat transcript holding only the assistant greeting.

    A new list is built on every call so callers can mutate it freely.
    """
    greeting = "Hi, choose tools from the sidebar and ask me anything."
    return [dict(role="assistant", content=greeting)]
110
+
111
+
112
# --- Page header -----------------------------------------------------------
st.set_page_config(page_title="LangChain Enhanced Tools Chat", page_icon="🔎")
st.title("🔎 LangChain Chat with Selectable Tools")
st.write(
    "Choose the tools you want to enable, then ask questions in the chat. "
    "When document chat is enabled, upload files in the sidebar first."
)

# --- Sidebar: API key status, tool toggles and the optional uploader -------
with st.sidebar:
    st.header("Settings")
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        st.warning("GROQ_API_KEY is missing from .env or the environment.")
    else:
        st.success("Groq API key loaded from .env.")

    st.header("Tools")
    use_search = st.checkbox("Search", value=True)
    use_wiki = st.checkbox("Wikipedia", value=True)
    use_arxiv = st.checkbox("Arxiv", value=True)
    use_documents = st.checkbox("Uploaded documents", value=False)

    # The uploader is rendered only while document chat is switched on;
    # otherwise the rest of the script sees an empty upload list.
    uploaded_files = (
        st.file_uploader(
            "Add document/s",
            type=["pdf", "txt", "md"],
            accept_multiple_files=True,
            help="Upload PDFs or text files to chat against them.",
        )
        if use_documents
        else []
    )
141
+
142
+
143
# Snapshot of the sidebar checkboxes for this rerun.
current_tool_selection = dict(
    search=use_search,
    wiki=use_wiki,
    arxiv=use_arxiv,
    documents=use_documents,
)

# A change in the enabled tools starts a new conversation; the very first
# run (nothing stored yet) only records the selection.
_selection_changed = (
    "tool_selection" in st.session_state
    and st.session_state["tool_selection"] != current_tool_selection
)
st.session_state["tool_selection"] = current_tool_selection
if _selection_changed:
    st.session_state["messages"] = default_messages()
    st.session_state["chat_memory"] = []
    st.toast("Tool selection changed. Chat was reinitialized.")

# Seed any session keys that do not exist yet.
for _state_key, _make_default in (
    ("messages", default_messages),
    ("chat_memory", list),
    ("document_retriever_signature", lambda: None),
    ("document_retriever", lambda: None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
170
+
171
+
172
# Assemble the agent's tool list for this rerun from the sidebar checkboxes.
enabled_tools = []
if use_search:
    enabled_tools.append(search)
if use_wiki:
    enabled_tools.append(wiki)
if use_arxiv:
    enabled_tools.append(arxiv)

if use_documents:
    if uploaded_files:
        signature = uploaded_files_signature(uploaded_files)
        # Re-index only when the set of uploads differs from the one that
        # produced the retriever currently held in session state.
        if signature != st.session_state["document_retriever_signature"]:
            # st.spinner is not available through object notation
            # (st.sidebar.spinner raises), so enter the sidebar container
            # and call st.spinner inside it instead.
            with st.sidebar, st.spinner("Indexing uploaded documents..."):
                st.session_state["document_retriever"] = build_retriever(uploaded_files)
            st.session_state["document_retriever_signature"] = signature

        if st.session_state["document_retriever"]:
            enabled_tools.append(create_documents_tool(st.session_state["document_retriever"]))
            st.sidebar.success("Documents are ready for chat.")
        else:
            # build_retriever returns None when no file yielded any text.
            st.sidebar.warning("No readable text was found in the uploaded documents.")
    else:
        st.sidebar.info("Upload document/s to enable the document chat tool.")
else:
    # Document chat is off: drop the index so it is rebuilt when re-enabled.
    st.session_state["document_retriever"] = None
    st.session_state["document_retriever_signature"] = None
198
+
199
+
200
# Replay the stored transcript; Streamlit re-executes the script on every
# interaction, so earlier turns must be drawn again each time.
for past_message in st.session_state["messages"]:
    st.chat_message(past_message["role"]).write(past_message["content"])


def _remember(role, content):
    """Append one turn to both the visible transcript and the agent memory."""
    st.session_state["messages"].append({"role": role, "content": content})
    st.session_state["chat_memory"].append({"role": role, "content": content})


if prompt := st.chat_input(placeholder="What is machine learning?"):
    # Stop before doing any work if the app is not usable yet.
    if not api_key:
        st.error("GROQ_API_KEY is missing. Add it to your environment or .env file.")
        st.stop()

    if not enabled_tools:
        st.error("Select at least one tool from the sidebar before chatting.")
        st.stop()

    _remember("user", prompt)
    st.chat_message("user").write(prompt)

    # The model and agent are rebuilt per question so they always reflect the
    # tools enabled on this rerun.
    llm = ChatGroq(
        groq_api_key=api_key,
        model_name="llama-3.1-8b-instant",
        streaming=True,
    )
    system_prompt = (
        "You are a helpful assistant. Use only the enabled tools when they are useful. "
        "If uploaded documents are enabled and the user asks about their files, use the "
        "uploaded_documents tool before answering. Provide concise answers and mention "
        "document sources when using uploaded document content."
    )
    search_agent = create_agent(
        model=llm,
        tools=enabled_tools,
        system_prompt=system_prompt,
    )

    with st.chat_message("assistant"):
        # The callback handler renders the agent's intermediate steps into
        # the container it is given.
        callback_handler = StreamlitCallbackHandler(
            st.container(),
            expand_new_thoughts=False,
        )
        result = search_agent.invoke(
            {"messages": st.session_state["chat_memory"]},
            config={"callbacks": [callback_handler]},
        )
        # The last message in the returned list is taken as the reply.
        response = result["messages"][-1].content
        _remember("assistant", response)
        st.write(response)
requirements.txt CHANGED
@@ -1,3 +1,15 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ arxiv==3.0.0
2
+ chromadb==1.5.7
3
+ ddgs==9.13.1
4
+ langchain==1.2.15
5
+ langchain-chroma==1.1.0
6
+ langchain-community==0.4.1
7
+ langchain-core==1.2.30
8
+ langchain-groq==1.1.2
9
+ langchain-huggingface==1.2.1
10
+ langchain-text-splitters==1.1.1
11
+ pypdf==6.10.2
12
+ python-dotenv==1.2.2
13
+ sentence-transformers==5.4.1
14
+ streamlit==1.56.0
15
+ wikipedia==1.4.0
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))