# NOTE: the original paste began with Hugging Face "Spaces: Build error" log
# lines; those are build-log artifacts from the hosting page, not application code.
# π« Clean and Final Streamlit RAG App (Three-Agent Architecture)
# --- Environment Setup (Safe for Hugging Face) ---
import os

# On Hugging Face Spaces only /tmp is guaranteed writable, so point every
# config/data/home directory Streamlit might touch at it.
for _env_var in ("STREAMLIT_HOME", "XDG_CONFIG_HOME", "XDG_DATA_HOME", "HOME"):
    os.environ[_env_var] = "/tmp"

import asyncio

# Streamlit script threads may not have an event loop attached; create and
# install one so async-dependent libraries do not fail at import/call time.
try:
    asyncio.get_running_loop()
except RuntimeError:
    asyncio.set_event_loop(asyncio.new_event_loop())
# --- Imports ---
# UI and data handling
import streamlit as st
import pandas as pd
import json
import io
from dotenv import load_dotenv, find_dotenv
# LangChain stack: embeddings/LLM, FAISS vector store, pandas dataframe agent,
# text chunking, and the retrieval-QA chain
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_community.vectorstores import FAISS
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
# Document parsers used by extract_text_from_file (pdf / docx)
import PyPDF2
from docx import Document
# --- Load API Key Securely ---
_ = load_dotenv(find_dotenv())

# Prefer Streamlit-managed secrets, falling back to the process environment.
# The original caught only st.runtime.secrets.StreamlitSecretNotFoundError,
# which misses two real failure modes: (a) a secrets file that exists but
# lacks OPENAI_API_KEY raises KeyError, and (b) the st.runtime.secrets
# attribute path itself is version-dependent and can raise AttributeError
# while evaluating the except clause. A broad catch is deliberate here: any
# secrets failure should degrade to the environment variable.
try:
    API_KEY = st.secrets["OPENAI_API_KEY"]
except Exception:
    API_KEY = os.getenv("OPENAI_API_KEY")

embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
# --- Streamlit Page Setup ---
# Must run before any other st.* output call on the page.
st.set_page_config(page_title="RAG File Chat", layout="centered")
st.title("π§ Chat with Your Uploaded File")
# --- Agent 1: File Ingestion and Indexing ---
def agent_alpha_file_uploader(file_content, file_type):
    """Load an uploaded file and build the query backend for it.

    Parameters:
        file_content (bytes): raw bytes of the uploaded file.
        file_type (str): lowercase extension — csv, xlsx, json, pdf or docx.

    Returns:
        tuple: (dataframe_agent, None) for tabular files,
               (None, vectorstore) for pdf/docx,
               (None, None) for unsupported types (after showing an error).
    """
    st.info("π Agent 1: Loading and indexing your file...")
    # All three tabular formats funnel into one dataframe-agent path below;
    # the original duplicated the LLM/agent construction in every branch.
    if file_type == "csv":
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8")))
    elif file_type == "xlsx":
        # Wrap bytes in BytesIO: read_excel accepts raw bytes only on newer
        # pandas versions, while a file-like object works everywhere.
        df = pd.read_excel(io.BytesIO(file_content))
    elif file_type == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
    elif file_type in ("pdf", "docx"):
        # Document path: extract text, chunk it, and index into FAISS.
        text = extract_text_from_file(file_content, file_type)
        chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_text(text)
        vectorstore = FAISS.from_texts(chunks, embeddings_model)
        return None, vectorstore
    else:
        st.error("β Unsupported file type.")
        return None, None
    llm = OpenAI(openai_api_key=API_KEY)
    return create_pandas_dataframe_agent(llm, df, verbose=False), None
# --- Agent 2: Query Resolution ---
def agent_beta_query_processor(query, file_type, df_agent=None, vectorstore=None):
    """Route *query* to the matching backend and return the raw answer string.

    Tabular files (csv/xlsx/json) are answered by the pandas dataframe agent;
    pdf/docx files go through retrieval-augmented QA over the vector store.
    """
    st.info("π§ Agent 2: Processing your question...")
    # Guard clause: anything that is not a document uses the dataframe agent.
    if file_type not in ("pdf", "docx"):
        return df_agent.run(query)
    # Document path: retrieve the 5 most relevant chunks, then answer with
    # the "stuff" chain (all retrieved context packed into one prompt).
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(openai_api_key=API_KEY),
        chain_type="stuff",
        retriever=retriever,
    )
    return qa_chain({"query": query})["result"]
# --- Agent 3: Response Enhancement ---
def agent_gamma_response_enhancer(response):
    """Ask the LLM to polish the clarity/format of *response*; return the result."""
    st.info("π Agent 3: Reviewing and enhancing the response...")
    prompt = (
        "Improve the clarity and format of the following response:\n"
        f"{response}"
    )
    return OpenAI(openai_api_key=API_KEY).invoke(prompt)
# --- Helper Function for Text Extraction ---
def extract_text_from_file(file_content, file_type):
    """Extract plain text from pdf/docx bytes; return "" for other types.

    Parameters:
        file_content (bytes): raw file bytes.
        file_type (str): lowercase extension ("pdf" or "docx").

    Returns:
        str: newline-joined page/paragraph text, skipping empty pieces.
    """
    if file_type == "pdf":
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Call extract_text() once per page — the original called it twice
        # (once for the filter, once for the value), doubling the parse cost.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    if file_type == "docx":
        doc = Document(io.BytesIO(file_content))
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    return ""
# --- Session State ---
# Seed every key this app relies on, once per session, so later reruns can
# read them unconditionally.
_STATE_DEFAULTS = {
    "uploaded_file": None,
    "file_uploaded": False,
    "vectorstore": None,
    "agent": None,
    "file_type": None,
}
for _state_key, _default in _STATE_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default
# --- File Upload UI ---
MAX_SIZE_MB = 50
uploaded = st.file_uploader("π Browse and select a file", type=["csv", "xlsx", "json", "pdf", "docx"])
if uploaded:
    # Single consolidated debug dump — the original rendered the identical
    # "Debug Upload Log" write/code pair twice.
    st.write("π§Ύ Debug Upload Log:")
    st.code(f"Filename: {uploaded.name} | Type: {uploaded.type} | Size: {uploaded.size} bytes")
    st.write("π₯ Upload Raw Details (client-side and browser environment check):")
    st.json({
        # st.experimental_get_query_params() is deprecated and never carried
        # a 'user-agent' entry (it always reported 'N/A'); dropped.
        "streamlit_version": st.__version__,
        "name": uploaded.name,
        "type": uploaded.type,
        "size_bytes": uploaded.size,
        "is_excel": uploaded.name.endswith(('.xls', '.xlsx')),
        "is_binary": isinstance(uploaded, io.BytesIO),
    })
    st.write("π File metadata:")
    st.json({"name": uploaded.name, "size_kb": uploaded.size / 1024, "type": uploaded.type})
    # Enforce the size cap before any further processing.
    if uploaded.size > MAX_SIZE_MB * 1024 * 1024:
        st.error(f"β File too large. Maximum allowed size is {MAX_SIZE_MB}MB.")
        st.stop()
    st.session_state.uploaded_file = uploaded
    st.info(f"β File selected: `{uploaded.name}` ({uploaded.size / 1024:.1f} KB)")
# Process the selected file only after the user explicitly confirms upload.
if st.session_state.uploaded_file and st.button("π€ Upload File"):
    try:
        st.write("π‘ Reading uploaded file content...")
        content = st.session_state.uploaded_file.read()
        st.write("π§ͺ File read complete. Size:", len(content), "bytes")
        # Debug preview: first 200 bytes (or 500 chars for text content).
        st.code(str(content[:200]) + "..." if isinstance(content, bytes) else content[:500])
        # File type is inferred from the (lowercased) filename extension.
        ftype = st.session_state.uploaded_file.name.split(".")[-1].lower()
        with st.spinner("π Agent Alpha (Uploader): Processing and indexing file..."):
            agent, vectorstore = agent_alpha_file_uploader(content, ftype)
        if agent or vectorstore:
            # Persist processing results in session state so the query UI
            # survives Streamlit's script reruns.
            st.session_state.agent = agent
            st.session_state.vectorstore = vectorstore
            st.session_state.file_uploaded = True
            st.session_state.file_type = ftype
            st.success("β File processed successfully.")
        else:
            st.error("β οΈ Failed to process file.")
    except Exception as e:
        # Top-level UI boundary: show the failure instead of crashing the app.
        st.error("β Upload failed. Try a smaller file or check connection.")
        st.exception(e)
# --- Query UI ---
# Only shown once a file has been successfully processed.
if st.session_state.file_uploaded:
    output_format = st.selectbox("π Select Output Format", ["Plain Text", "Markdown", "Tabular View"])
    query = st.text_area("π Ask a question about your uploaded file")
    if st.button("Submit Query"):
        if not query.strip():
            st.warning("β οΈ Please enter a valid question.")
        else:
            # Agent 2: answer the query from the indexed file.
            with st.spinner("π‘ Agent Beta (Processor): Handling your query..."):
                raw_response = agent_beta_query_processor(
                    query,
                    st.session_state.file_type,
                    df_agent=st.session_state.agent,
                    vectorstore=st.session_state.vectorstore,
                )
            # Agent 3: rewrite the raw answer for clarity/formatting.
            with st.spinner("β¨ Agent Gamma (Reviewer): Enhancing the response..."):
                enhanced_response = agent_gamma_response_enhancer(raw_response)
            st.subheader("π Final Answer")
            if output_format == "Plain Text":
                st.text(enhanced_response)
            elif output_format == "Markdown":
                st.markdown(enhanced_response)
            elif output_format == "Tabular View":
                # Try tab-separated rows first, then fall back to commas.
                rows = [line.split("\t") for line in enhanced_response.split("\n") if "\t" in line]
                if not rows or len(rows[0]) == 1:
                    rows = [line.split(",") for line in enhanced_response.split("\n") if "," in line]
                try:
                    # First parsed row is treated as the header; rows[0] on an
                    # empty list raises IndexError, handled by the except below.
                    df = pd.DataFrame(rows[1:], columns=rows[0])
                    st.dataframe(df)
                except Exception:
                    st.warning("β οΈ Could not render table. Showing raw text.")
                    st.text(enhanced_response)