File size: 8,328 Bytes
8b71299
9507dac
 
1179cbe
7950b25
 
 
9507dac
5a9b0ef
2189cf4
 
 
 
 
 
9507dac
070d311
d976d37
 
 
9507dac
2189cf4
 
ff7e11e
 
 
 
 
 
9507dac
070d311
e4c86d7
 
 
 
070d311
2189cf4
f4add34
9507dac
9162cb2
2786a5d
9162cb2
8b71299
c2a4dd2
8b71299
ff7e11e
9162cb2
d976d37
8b71299
ff7e11e
9162cb2
d976d37
8b71299
ff7e11e
 
d976d37
8b71299
ff7e11e
 
9507dac
8b71299
 
ff7e11e
2786a5d
8b71299
 
 
c2a4dd2
8b71299
 
 
 
 
 
 
 
 
 
 
 
 
c2a4dd2
8b71299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff7e11e
9507dac
201de73
2786a5d
0ed5b05
a4eadbd
 
 
 
 
 
 
 
 
 
 
 
b1e3a2b
62465b5
de2e171
 
 
201de73
 
 
0ed5b05
2786a5d
0ed5b05
2786a5d
de2e171
b1e3a2b
de2e171
a4eadbd
 
de2e171
c2a4dd2
 
8b71299
 
 
 
 
 
 
 
de2e171
 
 
0ed5b05
9507dac
0ed5b05
2786a5d
 
0ed5b05
 
 
2786a5d
0ed5b05
c2a4dd2
 
8b71299
 
 
 
 
 
c2a4dd2
 
8b71299
 
0ed5b05
8b71299
0ed5b05
8b71299
0ed5b05
8b71299
0ed5b05
8b71299
0ed5b05
 
 
 
2786a5d
8b71299
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# 🫠 Clean and Final Streamlit RAG App (Three-Agent Architecture)

# --- Environment Setup (Safe for Hugging Face) ---
import os

# Redirect every config/data/home path Streamlit touches into /tmp so the
# app can run on read-only container filesystems (e.g. Hugging Face Spaces).
for _env_var in ("STREAMLIT_HOME", "XDG_CONFIG_HOME", "XDG_DATA_HOME", "HOME"):
    os.environ[_env_var] = "/tmp"

import asyncio


def ensure_event_loop():
    """Guarantee this thread has an event loop.

    Streamlit executes scripts in a worker thread that has no default
    loop; if none is running, install a fresh one.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        asyncio.set_event_loop(asyncio.new_event_loop())


ensure_event_loop()

# --- Imports ---
import streamlit as st
import pandas as pd
import json
import io
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_community.vectorstores import FAISS
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
import PyPDF2
from docx import Document

# --- Load API Key Securely ---
_ = load_dotenv(find_dotenv())
try:
    API_KEY = st.secrets["OPENAI_API_KEY"]
except Exception:
    # st.secrets raises StreamlitSecretNotFoundError when no secrets.toml
    # exists, and a plain KeyError when the file exists but lacks this key.
    # The previous clause caught only `st.runtime.secrets.StreamlitSecretNotFoundError`,
    # an attribute path that varies across Streamlit versions (and can itself
    # raise AttributeError while the exception is in flight), and it never
    # caught KeyError at all. Catch broadly and fall back to the environment
    # (.env was loaded above).
    API_KEY = os.getenv("OPENAI_API_KEY")

# Shared embedding model used by the FAISS index for pdf/docx uploads.
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)

# --- Streamlit Page Setup ---
st.set_page_config(page_title="RAG File Chat", layout="centered")
st.title("🧠 Chat with Your Uploaded File")

# --- Agent 1: File Ingestion and Indexing ---
def _build_dataframe_agent(df):
    """Wrap *df* in a LangChain pandas agent backed by the OpenAI LLM."""
    llm = OpenAI(openai_api_key=API_KEY)
    return create_pandas_dataframe_agent(llm, df, verbose=False)


def agent_alpha_file_uploader(file_content, file_type):
    """Agent 1: parse raw upload bytes into a queryable backend.

    Args:
        file_content: raw bytes of the uploaded file.
        file_type: lowercase extension ("csv", "xlsx", "json", "pdf", "docx").

    Returns:
        (df_agent, vectorstore) — exactly one is non-None on success
        (DataFrame agent for tabular formats, FAISS store for documents);
        both are None for unsupported types.
    """
    st.info("πŸ“‚ Agent 1: Loading and indexing your file...")
    if file_type == "csv":
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8")))
        return _build_dataframe_agent(df), None
    elif file_type == "xlsx":
        # read_excel requires a path or file-like object; modern pandas
        # rejects raw bytes, so wrap them in BytesIO (fixes xlsx uploads).
        df = pd.read_excel(io.BytesIO(file_content))
        return _build_dataframe_agent(df), None
    elif file_type == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
        return _build_dataframe_agent(df), None
    elif file_type in ["pdf", "docx"]:
        text = extract_text_from_file(file_content, file_type)
        chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_text(text)
        vectorstore = FAISS.from_texts(chunks, embeddings_model)
        return None, vectorstore
    else:
        st.error("❌ Unsupported file type.")
        return None, None

# --- Agent 2: Query Resolution ---
def agent_beta_query_processor(query, file_type, df_agent=None, vectorstore=None):
    """Agent 2: answer *query* against the previously indexed file.

    Tabular types are delegated to the pandas DataFrame agent; document
    types (pdf/docx) go through a retrieval-QA chain over the vector store.
    """
    st.info("🧠 Agent 2: Processing your question...")
    if file_type not in ("pdf", "docx"):
        # Tabular data: let the DataFrame agent answer directly.
        return df_agent.run(query)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(openai_api_key=API_KEY),
        chain_type="stuff",
        retriever=retriever,
    )
    return qa_chain({"query": query})["result"]

# --- Agent 3: Response Enhancement ---
def agent_gamma_response_enhancer(response):
    """Agent 3: ask the LLM to polish *response* for clarity and format."""
    st.info("πŸ” Agent 3: Reviewing and enhancing the response...")
    llm = OpenAI(openai_api_key=API_KEY)
    prompt = f"Improve the clarity and format of the following response:\n{response}"
    return llm.invoke(prompt)

# --- Helper Function for Text Extraction ---
def extract_text_from_file(file_content, file_type):
    """Return plain text extracted from PDF or DOCX bytes ("" otherwise)."""
    stream = io.BytesIO(file_content)
    if file_type == "pdf":
        # Keep only pages that yield non-empty text.
        page_texts = (page.extract_text() for page in PyPDF2.PdfReader(stream).pages)
        return "\n".join(text for text in page_texts if text)
    if file_type == "docx":
        paragraphs = Document(stream).paragraphs
        return "\n".join(p.text for p in paragraphs if p.text.strip())
    return ""

# --- Session State ---
# Seed every key the app reads later so reruns never hit a missing key.
_SESSION_DEFAULTS = {
    "uploaded_file": None,
    "file_uploaded": False,
    "vectorstore": None,
    "agent": None,
    "file_type": None,
}
for _state_key, _default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default

# --- File Upload UI ---
MAX_SIZE_MB = 50  # hard cap on upload size to protect memory
uploaded = st.file_uploader("πŸ“ Browse and select a file", type=["csv", "xlsx", "json", "pdf", "docx"])
if uploaded:
    # Single consolidated debug panel. The previous version printed the
    # same filename/type/size metadata three times, and pulled a
    # "user-agent" out of st.experimental_get_query_params() — a
    # deprecated API whose query params never contain a user-agent, so
    # that field was always 'N/A'.
    st.write("🧾 Debug Upload Log:")
    st.code(f"Filename: {uploaded.name} | Type: {uploaded.type} | Size: {uploaded.size} bytes")
    st.json({
        "streamlit_version": st.__version__,
        "name": uploaded.name,
        "type": uploaded.type,
        "size_bytes": uploaded.size,
        "size_kb": uploaded.size / 1024,
        "is_excel": uploaded.name.endswith(('.xls', '.xlsx')),
    })

    if uploaded.size > MAX_SIZE_MB * 1024 * 1024:
        st.error(f"❌ File too large. Maximum allowed size is {MAX_SIZE_MB}MB.")
        st.stop()
    st.session_state.uploaded_file = uploaded
    st.info(f"βœ… File selected: `{uploaded.name}` ({uploaded.size / 1024:.1f} KB)")

if st.session_state.uploaded_file and st.button("πŸ“€ Upload File"):
    try:
        st.write("πŸ“‘ Reading uploaded file content...")
        # Streamlit reruns this script on every interaction; after a prior
        # read the UploadedFile pointer sits at EOF and read() returns
        # b"", silently indexing an empty file. Rewind first.
        st.session_state.uploaded_file.seek(0)
        content = st.session_state.uploaded_file.read()
        st.write("πŸ§ͺ File read complete. Size:", len(content), "bytes")
        st.code(str(content[:200]) + "..." if isinstance(content, bytes) else content[:500])
        # Lowercased extension selects the ingestion path in Agent Alpha.
        ftype = st.session_state.uploaded_file.name.split(".")[-1].lower()
        with st.spinner("πŸ”„ Agent Alpha (Uploader): Processing and indexing file..."):
            agent, vectorstore = agent_alpha_file_uploader(content, ftype)
            if agent or vectorstore:
                st.session_state.agent = agent
                st.session_state.vectorstore = vectorstore
                st.session_state.file_uploaded = True
                st.session_state.file_type = ftype
                st.success("βœ… File processed successfully.")
            else:
                st.error("⚠️ Failed to process file.")
    except Exception as e:
        st.error("❌ Upload failed. Try a smaller file or check connection.")
        st.exception(e)

# --- Query UI ---
def _parse_delimited_table(text):
    """Split *text* into rows: prefer tab-delimited lines, fall back to
    comma-delimited ones when tabs are absent or yield a single column."""
    lines = text.split("\n")
    rows = [line.split("\t") for line in lines if "\t" in line]
    if not rows or len(rows[0]) == 1:
        rows = [line.split(",") for line in lines if "," in line]
    return rows


if st.session_state.file_uploaded:
    output_format = st.selectbox("πŸ“‹ Select Output Format", ["Plain Text", "Markdown", "Tabular View"])
    query = st.text_area("πŸ” Ask a question about your uploaded file")

    if st.button("Submit Query"):
        if not query.strip():
            st.warning("⚠️ Please enter a valid question.")
        else:
            # Agent pipeline: Beta answers, Gamma polishes.
            with st.spinner("πŸ’‘ Agent Beta (Processor): Handling your query..."):
                raw_response = agent_beta_query_processor(
                    query,
                    st.session_state.file_type,
                    df_agent=st.session_state.agent,
                    vectorstore=st.session_state.vectorstore,
                )

            with st.spinner("✨ Agent Gamma (Reviewer): Enhancing the response..."):
                enhanced_response = agent_gamma_response_enhancer(raw_response)

            st.subheader("πŸ“Œ Final Answer")
            if output_format == "Plain Text":
                st.text(enhanced_response)
            elif output_format == "Markdown":
                st.markdown(enhanced_response)
            elif output_format == "Tabular View":
                parsed_rows = _parse_delimited_table(enhanced_response)
                try:
                    # First row becomes the header; failure (e.g. no rows
                    # or ragged columns) falls back to raw text.
                    st.dataframe(pd.DataFrame(parsed_rows[1:], columns=parsed_rows[0]))
                except Exception:
                    st.warning("⚠️ Could not render table. Showing raw text.")
                    st.text(enhanced_response)