# Clean and Final Streamlit RAG App (Three-Agent Architecture)
# --- Environment Setup (Safe for Hugging Face) ---
import os

# Point every config/data/home path at /tmp so Streamlit can write its
# files inside a read-only Hugging Face Spaces container.
os.environ.update({
    "STREAMLIT_HOME": "/tmp",
    "XDG_CONFIG_HOME": "/tmp",
    "XDG_DATA_HOME": "/tmp",
    "HOME": "/tmp",
})

import asyncio

# Make sure this thread has an asyncio event loop: Streamlit's script
# runner thread may not create one, and some async clients require it.
try:
    asyncio.get_running_loop()
except RuntimeError:
    asyncio.set_event_loop(asyncio.new_event_loop())
# --- Imports ---
import streamlit as st
import pandas as pd
import json
import io
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_community.vectorstores import FAISS
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
import PyPDF2
from docx import Document
# --- Load API Key Securely ---
# Load a local .env file if one exists (no-op when absent, e.g. on Spaces).
_ = load_dotenv(find_dotenv())

# Prefer Streamlit secrets, falling back to the process environment.
# The original caught only st.runtime.secrets.StreamlitSecretNotFoundError,
# which misses the KeyError / FileNotFoundError that Streamlit raises when
# the key (or the secrets.toml file itself) is missing — so the fallback
# never ran in those cases.
try:
    API_KEY = st.secrets["OPENAI_API_KEY"]
except Exception:  # any secrets lookup failure -> fall back to the env var
    API_KEY = os.getenv("OPENAI_API_KEY")

# Shared embedding model used by the PDF/DOCX vector-store path.
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
# --- Streamlit Page Setup ---
# Configure the browser-tab title and a centered single-column layout;
# set_page_config must be the first Streamlit command in the script.
# NOTE(review): "π§" (and other odd glyphs below) look like mojibake'd
# emoji from a lossy encoding pass — confirm against the original file.
st.set_page_config(page_title="RAG File Chat", layout="centered")
st.title("π§ Chat with Your Uploaded File")
# --- Agent 1: File Ingestion and Indexing ---
def agent_alpha_file_uploader(file_content, file_type):
    """Turn raw uploaded bytes into a query backend.

    Parameters
    ----------
    file_content : bytes
        The raw bytes read from the uploaded file.
    file_type : str
        Lower-cased file extension: csv, xlsx, json, pdf, or docx.

    Returns
    -------
    tuple
        ``(df_agent, None)`` for tabular formats (csv/xlsx/json),
        ``(None, vectorstore)`` for document formats (pdf/docx),
        ``(None, None)`` after an on-screen error for anything else.
    """
    st.info("π Agent 1: Loading and indexing your file...")

    # Tabular branch: parse the bytes into a DataFrame.
    df = None
    if file_type == "csv":
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8")))
    elif file_type == "xlsx":
        # BUGFIX: read_excel needs a file-like object; the original passed
        # the raw bytes straight through, which modern pandas rejects.
        df = pd.read_excel(io.BytesIO(file_content))
    elif file_type == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8")))

    if df is not None:
        # One LLM construction shared by all tabular paths (the original
        # repeated these two lines three times).
        llm = OpenAI(openai_api_key=API_KEY)
        return create_pandas_dataframe_agent(llm, df, verbose=False), None

    # Document branch: chunk the extracted text and index it with FAISS.
    if file_type in ["pdf", "docx"]:
        text = extract_text_from_file(file_content, file_type)
        chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_text(text)
        vectorstore = FAISS.from_texts(chunks, embeddings_model)
        return None, vectorstore

    st.error("β Unsupported file type.")
    return None, None
# --- Agent 2: Query Resolution ---
def agent_beta_query_processor(query, file_type, df_agent=None, vectorstore=None):
    """Answer *query* using whichever backend matches *file_type*.

    Tabular files (csv/xlsx/json) are routed to the pandas dataframe
    agent; pdf/docx files go through a RetrievalQA chain over the FAISS
    vector store, retrieving the top 5 chunks.
    """
    st.info("π§ Agent 2: Processing your question...")

    # Guard clause: anything that is not a document goes to the df agent.
    if file_type not in ["pdf", "docx"]:
        return df_agent.run(query)

    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(openai_api_key=API_KEY),
        chain_type="stuff",
        retriever=retriever,
    )
    answer = qa_chain({"query": query})
    return answer["result"]
# --- Agent 3: Response Enhancement ---
def agent_gamma_response_enhancer(response):
    """Run the raw answer through the LLM once more to polish its wording."""
    st.info("π Agent 3: Reviewing and enhancing the response...")
    prompt = f"Improve the clarity and format of the following response:\n{response}"
    reviewer = OpenAI(openai_api_key=API_KEY)
    return reviewer.invoke(prompt)
# --- Helper Function for Text Extraction ---
def extract_text_from_file(file_content, file_type):
    """Extract plain text from PDF or DOCX bytes.

    Parameters
    ----------
    file_content : bytes
        Raw file bytes.
    file_type : str
        Either "pdf" or "docx"; any other value returns "".

    Returns
    -------
    str
        Non-empty page/paragraph texts joined by newlines.
    """
    if file_type == "pdf":
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Call extract_text() once per page — the original invoked it twice
        # per page (once in the filter, once for the value), doubling the
        # most expensive step.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    elif file_type == "docx":
        doc = Document(io.BytesIO(file_content))
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    return ""
# --- Session State ---
# Seed every per-session key exactly once so later code can read them
# without existence checks. Item assignment on st.session_state is
# equivalent to the attribute form the original used.
_SESSION_DEFAULTS = {
    "uploaded_file": None,
    "file_uploaded": False,
    "vectorstore": None,
    "agent": None,
    "file_type": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# --- File Upload UI ---
MAX_SIZE_MB = 50  # hard cap on accepted upload size

uploaded = st.file_uploader("π Browse and select a file", type=["csv", "xlsx", "json", "pdf", "docx"])
if uploaded:
    # Debug log — the original emitted this identical block twice in a row;
    # once is enough.
    st.write("π§Ύ Debug Upload Log:")
    st.code(f"Filename: {uploaded.name} | Type: {uploaded.type} | Size: {uploaded.size} bytes")
    st.write("π₯ Upload Raw Details (client-side and browser environment check):")
    st.json({
        # NOTE(review): experimental_get_query_params is deprecated and URL
        # query params never carry a browser user-agent, so this almost
        # always reports 'N/A'. Kept for parity with the original output.
        "user_agent": st.experimental_get_query_params().get('user-agent', ['N/A'])[0],
        "streamlit_version": st.__version__,
        "name": uploaded.name,
        "type": uploaded.type,
        "size_bytes": uploaded.size,
        "is_excel": uploaded.name.endswith(('.xls', '.xlsx')),
        "is_binary": isinstance(uploaded, io.BytesIO),
    })
    st.write("π File metadata:")
    st.json({"name": uploaded.name, "size_kb": uploaded.size / 1024, "type": uploaded.type})

    # Enforce the size cap before doing any further work on the file.
    if uploaded.size > MAX_SIZE_MB * 1024 * 1024:
        st.error(f"β File too large. Maximum allowed size is {MAX_SIZE_MB}MB.")
        st.stop()

    st.session_state.uploaded_file = uploaded
    # Re-joined: the original split this f-string across two physical lines,
    # which is a syntax error as pasted.
    st.info(f"β File selected: `{uploaded.name}` ({uploaded.size / 1024:.1f} KB)")
# Explicit "Upload" step: read the selected file's bytes and hand them to
# Agent 1 (Alpha) for parsing/indexing, recording the results in session
# state so the query UI below can use them across reruns.
if st.session_state.uploaded_file and st.button("π€ Upload File"):
    try:
        st.write("π‘ Reading uploaded file content...")
        content = st.session_state.uploaded_file.read()
        st.write("π§ͺ File read complete. Size:", len(content), "bytes")
        # Preview: repr of the first 200 bytes for binary content, or the
        # first 500 characters for text.
        st.code(str(content[:200]) + "..." if isinstance(content, bytes) else content[:500])

        # The file extension selects which ingestion path Agent 1 takes.
        ftype = st.session_state.uploaded_file.name.split(".")[-1].lower()
        with st.spinner("π Agent Alpha (Uploader): Processing and indexing file..."):
            agent, vectorstore = agent_alpha_file_uploader(content, ftype)

        if agent or vectorstore:
            st.session_state.agent = agent
            st.session_state.vectorstore = vectorstore
            st.session_state.file_uploaded = True
            st.session_state.file_type = ftype
            # Re-joined: the original split this string literal across two
            # physical lines, a syntax error as pasted.
            st.success("β File processed successfully.")
        else:
            st.error("β οΈ Failed to process file.")
    except Exception as e:
        # Top-level UI boundary: surface any ingestion failure to the user.
        st.error("β Upload failed. Try a smaller file or check connection.")
        st.exception(e)
# --- Query UI ---
# Shown only after a file has been successfully processed: collect a
# question, run it through Agents Beta and Gamma, and render the answer
# in the user's chosen format.
if st.session_state.file_uploaded:
    output_format = st.selectbox("π Select Output Format", ["Plain Text", "Markdown", "Tabular View"])
    query = st.text_area("π Ask a question about your uploaded file")

    if st.button("Submit Query"):
        if not query.strip():
            st.warning("β οΈ Please enter a valid question.")
        else:
            # Agent 2 answers the question against the indexed file.
            with st.spinner("π‘ Agent Beta (Processor): Handling your query..."):
                raw_response = agent_beta_query_processor(
                    query,
                    st.session_state.file_type,
                    df_agent=st.session_state.agent,
                    vectorstore=st.session_state.vectorstore,
                )
            # Agent 3 polishes the raw answer before it is displayed.
            with st.spinner("β¨ Agent Gamma (Reviewer): Enhancing the response..."):
                enhanced_response = agent_gamma_response_enhancer(raw_response)

            st.subheader("π Final Answer")
            if output_format == "Plain Text":
                st.text(enhanced_response)
            elif output_format == "Markdown":
                st.markdown(enhanced_response)
            elif output_format == "Tabular View":
                # Try tab-delimited rows first; fall back to comma-splitting
                # when tabs yield nothing usable (no rows or single-column).
                lines = enhanced_response.split("\n")
                rows = [ln.split("\t") for ln in lines if "\t" in ln]
                if not rows or len(rows[0]) == 1:
                    rows = [ln.split(",") for ln in lines if "," in ln]
                try:
                    table = pd.DataFrame(rows[1:], columns=rows[0])
                    st.dataframe(table)
                except Exception:
                    st.warning("β οΈ Could not render table. Showing raw text.")
                    st.text(enhanced_response)