PRSHNTKUMR committed on
Commit
ff7e11e
·
verified ·
1 Parent(s): 329e671

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +102 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,104 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import json
4
+ import io
5
+ import os
6
+
7
+ from langchain.llms import OpenAI
8
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
9
+ from langchain.text_splitter import CharacterTextSplitter
10
+ from langchain.embeddings import OpenAIEmbeddings
11
+ from langchain.vectorstores import Chroma
12
+ from langchain.chains import RetrievalQA
13
+
14
+ import PyPDF2
15
+ from docx import Document
16
+
17
+ from dotenv import load_dotenv, find_dotenv
18
+
19
# Load variables from a .env file (if one can be found) into the process
# environment, so the key lookup below also works when running locally.
_ = load_dotenv(find_dotenv())

# Read the OpenAI API key from the environment. This is os.getenv, not
# st.secrets: the key must be exported as OPENAI_API_KEY or be present in the
# .env file loaded above.
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    # Stop with an actionable message rather than handing a missing key to
    # the client constructors below.
    st.error("OPENAI_API_KEY is not set. Add it to the environment or to a .env file.")
    st.stop()

# Initialize embedding model and vector store in memory (no disk persistence)
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
vectorstore = Chroma(embedding_function=embeddings_model)

# Session flags: Streamlit re-runs this script on every interaction, so the
# "already indexed" marker has to live in session state.
if "agent_created" not in st.session_state:
    st.session_state.agent_created = False
31
+
32
+
33
def create_agent(file_content, file_type):
    """Create a pandas DataFrame agent from uploaded file bytes.

    Tabular formats (csv, xlsx, json) are parsed straight into a DataFrame.
    Documents (pdf, docx) are converted to text, split into chunks, added to
    the module-level ``vectorstore`` and also exposed as a one-column
    ("text") DataFrame.

    Args:
        file_content: Raw bytes of the uploaded file.
        file_type: File extension without the dot; matched
            case-insensitively.

    Returns:
        The agent built by ``create_pandas_dataframe_agent`` over the parsed
        DataFrame.

    Raises:
        ValueError: If ``file_type`` is not a supported extension.
    """
    kind = str(file_type).strip().lower()

    if kind == "csv":
        # "utf-8-sig" drops a leading BOM so it does not end up in the first
        # column name; BOM-less UTF-8 decodes exactly as before.
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8-sig")), header=0)
    elif kind == "xlsx":
        # Wrap the bytes in a file-like object, consistent with the other
        # branches, instead of passing raw bytes to pandas.
        df = pd.read_excel(io.BytesIO(file_content), header=0)
    elif kind == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8-sig")))
    elif kind in ("pdf", "docx"):
        text = extract_text_from_file(file_content, kind)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_text(text)
        df = pd.DataFrame({"text": texts})
        # Index the chunks for retrieval. Skipped when the document yielded
        # no text, so the store is never asked to add an empty batch.
        if texts:
            vectorstore.add_texts(
                texts=texts,
                metadatas=[{"source": kind} for _ in texts],
            )
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    llm = OpenAI(openai_api_key=API_KEY)
    return create_pandas_dataframe_agent(llm, df, verbose=False)
55
+
56
+
57
def extract_text_from_file(file_content, file_type):
    """Extract raw text from supported document formats.

    Args:
        file_content: Raw bytes of the document.
        file_type: "pdf" or "docx"; any other value yields an empty string.

    Returns:
        The extracted text joined with newlines. For "pdf", pages whose
        extracted text is empty are skipped; for "docx", paragraphs that are
        empty or whitespace-only are skipped. Returns "" for other types.
    """
    if file_type == "pdf":
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Extract each page exactly once; the filter and the joined value
        # share the same result instead of parsing the page twice.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)
    if file_type == "docx":
        doc = Document(io.BytesIO(file_content))
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    return ""
67
+
68
+
69
def query_agent(query):
    """Answer ``query`` with a RetrievalQA chain over the module-level vectorstore.

    A new "stuff" chain is assembled on each call from an OpenAI LLM and a
    retriever configured with ``k=5``.

    Args:
        query: The user's question.

    Returns:
        The value stored under "result" in the chain's output.
    """
    language_model = OpenAI(openai_api_key=API_KEY)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    chain = RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=retriever,
    )
    output = chain({"query": query})
    return output["result"]
78
+
79
+
80
# --- Streamlit UI ---
st.set_page_config(page_title="RAG from Upload", layout="centered")
st.title("🧠 Chat with Your File")

uploaded_file = st.file_uploader("Upload a file", type=["csv", "xlsx", "json", "pdf", "docx"])

if uploaded_file is not None:
    file_content = uploaded_file.read()
    # Lower-case the extension so "REPORT.PDF" is handled like "report.pdf".
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()
    # Name plus byte length identifies the upload; when it changes, the new
    # file is indexed instead of reusing the flag set for an earlier file.
    file_key = (uploaded_file.name, len(file_content))

    query = st.text_area("Enter your query")

    if st.button("Submit Query", type="primary"):
        if not query.strip():
            st.warning("Please enter a valid query.")
            st.stop()

        already_indexed = (
            st.session_state.get("agent_created", False)
            and st.session_state.get("indexed_file") == file_key
        )
        if not already_indexed:
            # The returned agent is not used here; answers come from
            # query_agent(), which reads the vectorstore.
            # NOTE(review): only pdf/docx uploads add text to the vectorstore
            # in create_agent - confirm how tabular files should be answered.
            # NOTE(review): nothing here clears chunks indexed for an earlier
            # upload - confirm whether the store should be reset per file.
            create_agent(file_content, file_type)
            st.session_state.agent_created = True
            st.session_state.indexed_file = file_key
            st.success("Data loaded and indexed.")

        response = query_agent(query)
        st.subheader("📌 Answer")
        st.write(response)