Spaces:
Build error
Build error
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +46 -52
src/streamlit_app.py
CHANGED
|
@@ -2,7 +2,6 @@ import streamlit as st
|
|
| 2 |
import pandas as pd
|
| 3 |
import json
|
| 4 |
import io
|
| 5 |
-
import os
|
| 6 |
|
| 7 |
from langchain.llms import OpenAI
|
| 8 |
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
|
|
@@ -21,53 +20,64 @@ _ = load_dotenv(find_dotenv())
|
|
| 21 |
# Get API key from Streamlit secrets
|
| 22 |
API_KEY = os.getenv("OPENAI_API_KEY")
|
| 23 |
|
| 24 |
-
# Initialize
|
| 25 |
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
|
| 26 |
vectorstore = Chroma(embedding_function=embeddings_model)
|
| 27 |
|
| 28 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if "agent_created" not in st.session_state:
|
| 30 |
st.session_state.agent_created = False
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def create_agent(file_content, file_type):
|
| 34 |
-
"""
|
|
|
|
| 35 |
if file_type == "csv":
|
| 36 |
-
df = pd.read_csv(io.StringIO(file_content.decode("utf-8"))
|
|
|
|
| 37 |
elif file_type == "xlsx":
|
| 38 |
-
df = pd.read_excel(file_content
|
|
|
|
| 39 |
elif file_type == "json":
|
| 40 |
df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
|
|
|
|
| 41 |
elif file_type in ["pdf", "docx"]:
|
| 42 |
text = extract_text_from_file(file_content, file_type)
|
|
|
|
| 43 |
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
| 44 |
texts = text_splitter.split_text(text)
|
| 45 |
df = pd.DataFrame({"text": texts})
|
| 46 |
-
|
| 47 |
-
raise ValueError(f"Unsupported file type: {file_type}")
|
| 48 |
-
|
| 49 |
-
# Add text chunks to vectorstore
|
| 50 |
-
if file_type in ["pdf", "docx"]:
|
| 51 |
vectorstore.add_texts(texts=df['text'].tolist(), metadatas=[{'source': file_type}] * len(df))
|
| 52 |
-
|
| 53 |
-
llm = OpenAI(openai_api_key=API_KEY)
|
| 54 |
-
return create_pandas_dataframe_agent(llm, df, verbose=False)
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def extract_text_from_file(file_content, file_type):
|
| 58 |
-
"""Extract raw text from supported document formats."""
|
| 59 |
-
if file_type == "pdf":
|
| 60 |
-
reader = PyPDF2.PdfReader(io.BytesIO(file_content))
|
| 61 |
-
return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
|
| 62 |
-
elif file_type == "docx":
|
| 63 |
-
doc = Document(io.BytesIO(file_content))
|
| 64 |
-
return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
|
| 65 |
else:
|
| 66 |
-
|
|
|
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
def query_agent(query):
|
| 70 |
-
"""Query
|
| 71 |
qa_chain = RetrievalQA.from_chain_type(
|
| 72 |
llm=OpenAI(openai_api_key=API_KEY),
|
| 73 |
chain_type="stuff",
|
|
@@ -76,40 +86,24 @@ def query_agent(query):
|
|
| 76 |
result = qa_chain({"query": query})
|
| 77 |
return result["result"]
|
| 78 |
|
| 79 |
-
|
| 80 |
-
# --- Streamlit UI ---
|
| 81 |
-
st.set_page_config(page_title="RAG from Upload", layout="centered")
|
| 82 |
-
st.title("π§ Chat with Your File")
|
| 83 |
-
|
| 84 |
-
uploaded_file = st.file_uploader("Upload a file", type=["csv", "xlsx", "json", "pdf", "docx"])
|
| 85 |
-
|
| 86 |
if uploaded_file is not None:
|
| 87 |
st.success(f"β
File uploaded: `{uploaded_file.name}` ({uploaded_file.size / 1024:.1f} KB)")
|
| 88 |
-
|
| 89 |
file_content = uploaded_file.read()
|
| 90 |
file_type = uploaded_file.name.split(".")[-1]
|
| 91 |
|
| 92 |
-
st.write("**File type detected:**", file_type.upper())
|
| 93 |
-
|
| 94 |
if not st.session_state.agent_created:
|
| 95 |
-
with st.spinner("
|
| 96 |
create_agent(file_content, file_type)
|
| 97 |
st.session_state.agent_created = True
|
| 98 |
-
st.success("π File successfully processed and indexed. You can now ask your question below.")
|
| 99 |
-
|
| 100 |
|
| 101 |
-
query = st.text_area("
|
| 102 |
|
| 103 |
-
if st.button("Submit Query"
|
| 104 |
if not query.strip():
|
| 105 |
-
st.warning("Please enter a
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
st.
|
| 111 |
-
st.success("Data loaded and indexed.")
|
| 112 |
-
|
| 113 |
-
response = query_agent(query)
|
| 114 |
-
st.subheader("π Answer")
|
| 115 |
-
st.write(response)
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import json
|
| 4 |
import io
|
|
|
|
| 5 |
|
| 6 |
from langchain.llms import OpenAI
|
| 7 |
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
|
|
|
|
| 20 |
# `os.getenv` is used just below, but this change set drops `import os` from
# the top of the file; import it here so the module does not fail with
# NameError. (Harmless if `os` is also imported in a part of the file not
# visible in this view.)
import os

# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize Chroma in-memory
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
vectorstore = Chroma(embedding_function=embeddings_model)

# Streamlit UI setup
st.set_page_config(page_title="RAG File Chat", layout="centered")
st.title("🧠 Chat with Your Data File")

# Session state flag: remembers across reruns that the upload was processed.
if "agent_created" not in st.session_state:
    st.session_state.agent_created = False

# Upload section
# NOTE(review): the leading glyph in the label is a mis-decoded emoji whose
# original bytes cannot be recovered from this view; left untouched.
uploaded_file = st.file_uploader("π Upload a file", type=["csv", "xlsx", "json", "pdf", "docx"])
|
| 37 |
+
|
| 38 |
+
def extract_text_from_file(file_content, file_type):
    """Extract plain text from the raw bytes of a PDF or DOCX file.

    Args:
        file_content: Raw bytes of the uploaded document.
        file_type: File extension without the dot; "pdf" and "docx" are
            handled.

    Returns:
        The extracted text joined with newlines. PDF pages that yield no
        text and DOCX paragraphs that are blank are skipped. Any other
        file type returns an empty string.
    """
    if file_type == "pdf":
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Call extract_text() once per page instead of twice (once for the
        # filter and once for the value).
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    if file_type == "docx":
        doc = Document(io.BytesIO(file_content))
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    return ""
|
| 47 |
|
| 48 |
def create_agent(file_content, file_type):
    """Load an uploaded file, index document text, and build a DataFrame agent.

    Args:
        file_content: Raw bytes of the uploaded file.
        file_type: File extension without the dot: "csv", "xlsx", "json",
            "pdf" or "docx".

    Returns:
        The agent produced by ``create_pandas_dataframe_agent``, or ``None``
        when ``file_type`` is not supported (an error is shown in the UI).

    Side effects:
        Reports progress through Streamlit messages. For "pdf" and "docx"
        the text chunks are added to the module-level ``vectorstore``.
    """
    # NOTE(review): several messages below start with a lone "π" or "β";
    # these are mis-decoded emoji whose original bytes cannot be recovered
    # from this view, so they are left untouched.
    # NOTE(review): only pdf/docx content is added to the vectorstore. If
    # query_agent retrieves solely from the vectorstore, questions about
    # CSV/XLSX/JSON uploads will have no context -- confirm intended design.
    if file_type == "csv":
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8")))
        st.success("π CSV file loaded into DataFrame.")
    elif file_type == "xlsx":
        # Wrap the bytes in a file-like object: pandas deprecates passing
        # raw bytes straight to read_excel.
        df = pd.read_excel(io.BytesIO(file_content))
        st.success("π Excel file loaded into DataFrame.")
    elif file_type == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
        st.success("π JSON file loaded into DataFrame.")
    elif file_type in ("pdf", "docx"):
        text = extract_text_from_file(file_content, file_type)
        st.success(f"π {file_type.upper()} text extracted.")
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.split_text(text)
        df = pd.DataFrame({"text": texts})
        st.success("✂️ Text split into chunks.")
        if texts:
            vectorstore.add_texts(
                texts=texts,
                metadatas=[{"source": file_type} for _ in texts],
            )
            st.success("🧠 Embeddings generated and stored in vector database.")
        else:
            # Nothing to embed (e.g. a scanned PDF with no text layer).
            st.warning("No extractable text found in the file; nothing was indexed.")
    else:
        st.error("β Unsupported file type.")
        return None

    # Create agent
    llm = OpenAI(openai_api_key=API_KEY)
    agent = create_pandas_dataframe_agent(llm, df, verbose=False)
    st.success("🤖 Agent created successfully.")
    return agent
|
| 78 |
|
| 79 |
def query_agent(query):
|
| 80 |
+
"""Query vectorstore via RetrievalQA."""
|
| 81 |
qa_chain = RetrievalQA.from_chain_type(
|
| 82 |
llm=OpenAI(openai_api_key=API_KEY),
|
| 83 |
chain_type="stuff",
|
|
|
|
| 86 |
result = qa_chain({"query": query})
|
| 87 |
return result["result"]
|
| 88 |
|
| 89 |
+
# Main logic: process the upload once, then answer questions about it.
# NOTE(review): this view of the file lost its indentation; the query UI is
# assumed to be nested under the upload check -- confirm against the file.
# NOTE(review): lone "π" prefixes are mis-decoded emoji that cannot be
# recovered from this view and are left untouched.
if uploaded_file is not None:
    st.success(f"✅ File uploaded: `{uploaded_file.name}` ({uploaded_file.size / 1024:.1f} KB)")
    file_content = uploaded_file.read()
    # Lower-case the extension so "REPORT.PDF" is handled like "report.pdf".
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()

    if not st.session_state.agent_created:
        with st.spinner("π Processing and indexing the file..."):
            agent = create_agent(file_content, file_type)
        # create_agent returns None for unsupported types; only record
        # success when an agent was actually built.
        st.session_state.agent_created = agent is not None

    query = st.text_area("💬 Ask a question based on the file")

    if st.button("Submit Query"):
        if not query.strip():
            st.warning("⚠️ Please enter a query.")
        else:
            with st.spinner("💡 Thinking..."):
                answer = query_agent(query)
            st.subheader("π Answer")
            st.write(answer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|