PRSHNTKUMR committed on
Commit
9162cb2
Β·
verified Β·
1 Parent(s): 8ba286f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +46 -52
src/streamlit_app.py CHANGED
@@ -2,7 +2,6 @@ import streamlit as st
2
  import pandas as pd
3
  import json
4
  import io
5
- import os
6
 
7
  from langchain.llms import OpenAI
8
  from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
@@ -21,53 +20,64 @@ _ = load_dotenv(find_dotenv())
21
  # Get API key from Streamlit secrets
22
  API_KEY = os.getenv("OPENAI_API_KEY")
23
 
24
- # Initialize embedding model and vector store in memory (no disk persistence)
25
  embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
26
  vectorstore = Chroma(embedding_function=embeddings_model)
27
 
28
- # Session flags
 
 
 
 
29
  if "agent_created" not in st.session_state:
30
  st.session_state.agent_created = False
31
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def create_agent(file_content, file_type):
34
- """Create an agent from file content and index the data."""
 
35
  if file_type == "csv":
36
- df = pd.read_csv(io.StringIO(file_content.decode("utf-8")), header=0)
 
37
  elif file_type == "xlsx":
38
- df = pd.read_excel(file_content, header=0)
 
39
  elif file_type == "json":
40
  df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
 
41
  elif file_type in ["pdf", "docx"]:
42
  text = extract_text_from_file(file_content, file_type)
 
43
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
44
  texts = text_splitter.split_text(text)
45
  df = pd.DataFrame({"text": texts})
46
- else:
47
- raise ValueError(f"Unsupported file type: {file_type}")
48
-
49
- # Add text chunks to vectorstore
50
- if file_type in ["pdf", "docx"]:
51
  vectorstore.add_texts(texts=df['text'].tolist(), metadatas=[{'source': file_type}] * len(df))
52
-
53
- llm = OpenAI(openai_api_key=API_KEY)
54
- return create_pandas_dataframe_agent(llm, df, verbose=False)
55
-
56
-
57
- def extract_text_from_file(file_content, file_type):
58
- """Extract raw text from supported document formats."""
59
- if file_type == "pdf":
60
- reader = PyPDF2.PdfReader(io.BytesIO(file_content))
61
- return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
62
- elif file_type == "docx":
63
- doc = Document(io.BytesIO(file_content))
64
- return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
65
  else:
66
- return ""
 
67
 
 
 
 
 
 
68
 
69
  def query_agent(query):
70
- """Query the vectorstore using RAG."""
71
  qa_chain = RetrievalQA.from_chain_type(
72
  llm=OpenAI(openai_api_key=API_KEY),
73
  chain_type="stuff",
@@ -76,40 +86,24 @@ def query_agent(query):
76
  result = qa_chain({"query": query})
77
  return result["result"]
78
 
79
-
80
- # --- Streamlit UI ---
81
- st.set_page_config(page_title="RAG from Upload", layout="centered")
82
- st.title("🧠 Chat with Your File")
83
-
84
- uploaded_file = st.file_uploader("Upload a file", type=["csv", "xlsx", "json", "pdf", "docx"])
85
-
86
  if uploaded_file is not None:
87
  st.success(f"βœ… File uploaded: `{uploaded_file.name}` ({uploaded_file.size / 1024:.1f} KB)")
88
-
89
  file_content = uploaded_file.read()
90
  file_type = uploaded_file.name.split(".")[-1]
91
 
92
- st.write("**File type detected:**", file_type.upper())
93
-
94
  if not st.session_state.agent_created:
95
- with st.spinner("Indexing your file..."):
96
  create_agent(file_content, file_type)
97
  st.session_state.agent_created = True
98
- st.success("πŸ“š File successfully processed and indexed. You can now ask your question below.")
99
-
100
 
101
- query = st.text_area("Enter your query")
102
 
103
- if st.button("Submit Query", type="primary"):
104
  if not query.strip():
105
- st.warning("Please enter a valid query.")
106
- st.stop()
107
-
108
- if not st.session_state.agent_created:
109
- create_agent(file_content, file_type)
110
- st.session_state.agent_created = True
111
- st.success("Data loaded and indexed.")
112
-
113
- response = query_agent(query)
114
- st.subheader("πŸ“Œ Answer")
115
- st.write(response)
 
2
  import pandas as pd
3
  import json
4
  import io
 
5
 
6
  from langchain.llms import OpenAI
7
  from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
 
20
  # Get API key from Streamlit secrets
21
  API_KEY = os.getenv("OPENAI_API_KEY")
22
 
23
+ # Initialize Chroma in-memory
24
  embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
25
  vectorstore = Chroma(embedding_function=embeddings_model)
26
 
27
+ # Streamlit UI setup
28
+ st.set_page_config(page_title="RAG File Chat", layout="centered")
29
+ st.title("🧠 Chat with Your Data File")
30
+
31
+ # Session state flag
32
  if "agent_created" not in st.session_state:
33
  st.session_state.agent_created = False
34
 
35
+ # Upload section
36
+ uploaded_file = st.file_uploader("πŸ“ Upload a file", type=["csv", "xlsx", "json", "pdf", "docx"])
37
+
38
def extract_text_from_file(file_content, file_type):
    """Extract raw text from a PDF or DOCX file.

    Args:
        file_content: Raw bytes of the uploaded file.
        file_type: Lowercase extension, "pdf" or "docx".

    Returns:
        Extracted text with pages/paragraphs joined by newlines,
        or "" for any other file type.
    """
    if file_type == "pdf":
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # extract_text() is expensive and may return None/"" for image-only
        # pages — call it once per page (the original called it twice) and
        # keep only non-empty results.
        return "\n".join(
            text for page in reader.pages if (text := page.extract_text())
        )
    elif file_type == "docx":
        doc = Document(io.BytesIO(file_content))
        # Skip whitespace-only paragraphs.
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    return ""
47
 
48
def create_agent(file_content, file_type):
    """Load an uploaded file, index document text, and build a DataFrame agent.

    Tabular formats (csv/xlsx/json) are loaded straight into a DataFrame;
    pdf/docx are converted to text, chunked, and embedded into the
    module-level ``vectorstore`` so ``query_agent`` can retrieve them.

    Args:
        file_content: Raw bytes of the uploaded file.
        file_type: Lowercase extension: "csv", "xlsx", "json", "pdf" or "docx".

    Returns:
        A LangChain pandas DataFrame agent, or None for unsupported types.
    """
    if file_type == "csv":
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8")))
        st.success("📄 CSV file loaded into DataFrame.")
    elif file_type == "xlsx":
        # read_excel expects a path or file-like object, not raw bytes —
        # wrap the uploaded bytes in a BytesIO buffer.
        df = pd.read_excel(io.BytesIO(file_content))
        st.success("📄 Excel file loaded into DataFrame.")
    elif file_type == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
        st.success("📄 JSON file loaded into DataFrame.")
    elif file_type in ["pdf", "docx"]:
        text = extract_text_from_file(file_content, file_type)
        st.success(f"📃 {file_type.upper()} text extracted.")
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_text(text)
        df = pd.DataFrame({"text": texts})
        st.success("✂️ Text split into chunks.")
        # NOTE(review): only pdf/docx text is embedded; tabular uploads are
        # never added to the vectorstore, so RAG queries only see documents.
        vectorstore.add_texts(texts=df['text'].tolist(), metadatas=[{'source': file_type}] * len(df))
        st.success("🧠 Embeddings generated and stored in vector database.")
    else:
        st.error("❌ Unsupported file type.")
        return None

    # Create the pandas agent over whichever DataFrame was built above.
    llm = OpenAI(openai_api_key=API_KEY)
    agent = create_pandas_dataframe_agent(llm, df, verbose=False)
    st.success("🤖 Agent created successfully.")
    return agent
78
 
79
  def query_agent(query):
80
+ """Query vectorstore via RetrievalQA."""
81
  qa_chain = RetrievalQA.from_chain_type(
82
  llm=OpenAI(openai_api_key=API_KEY),
83
  chain_type="stuff",
 
86
  result = qa_chain({"query": query})
87
  return result["result"]
88
 
89
# Main Logic
if uploaded_file is not None:
    st.success(f"✅ File uploaded: `{uploaded_file.name}` ({uploaded_file.size / 1024:.1f} KB)")

    file_content = uploaded_file.read()
    # Normalise the extension so "Data.CSV" is handled like "data.csv";
    # without .lower() uppercase extensions hit the unsupported-type branch.
    file_type = uploaded_file.name.split(".")[-1].lower()

    # Index the file only once per session (flag lives in session_state).
    if not st.session_state.agent_created:
        with st.spinner("🔄 Processing and indexing the file..."):
            create_agent(file_content, file_type)
            st.session_state.agent_created = True

query = st.text_area("💬 Ask a question based on the file")

if st.button("Submit Query"):
    if not query.strip():
        st.warning("⚠️ Please enter a query.")
    else:
        with st.spinner("💡 Thinking..."):
            answer = query_agent(query)
            st.subheader("📌 Answer")
            st.write(answer)