# gradio_RAG_CHAT / app.py
# Source: PRSHNTKUMR's Hugging Face Space — "Update app.py", commit 6273e84 (verified)
# 🧠 Gradio RAG App for Hugging Face Deployment (Multi-File Support)
# --- Imports ---
import os
import io
import json
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI as OpenAI
from langchain_community.vectorstores import FAISS
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
import PyPDF2
from docx import Document
# --- Load API Key & shared state ---
# Load OPENAI_API_KEY from a .env file (if present) or the process environment.
_ = load_dotenv(find_dotenv())
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    # Fail fast: every agent below needs the key at construction time.
    raise ValueError("❌ OPENAI_API_KEY is missing. Please add it to .env or set as environment variable.")

# Shared embedding model, used both for indexing uploads and for retrieval.
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)

# On-disk persistence: normalized CSV copies, text chunks, FAISS index, upload log.
PERSIST_DIR = "persist_data"
os.makedirs(PERSIST_DIR, exist_ok=True)
TEXT_STORE = os.path.join(PERSIST_DIR, "text_chunks.json")   # JSON list of document chunks
FILE_LOG = os.path.join(PERSIST_DIR, "uploaded_files.json")  # JSON list of uploaded filenames
# --- Agent 1: File Ingestion and Indexing ---
def agent_alpha_file_uploader(file_obj, file_type, filename):
    """Ingest one uploaded file and persist it for later querying.

    Tabular files (csv/xlsx/json) are normalized to a CSV copy under
    PERSIST_DIR so Agent Theta can reload them; document files (pdf/docx)
    are chunked, appended to the shared text store, and re-embedded into
    the FAISS vectorstore.

    Args:
        file_obj: open file-like object for the upload.
        file_type: lowercase extension ("csv", "xlsx", "json", "pdf", "docx").
        filename: original upload name, used for the log and persisted copy.

    Returns:
        (df, vectorstore, status): exactly one of df/vectorstore is non-None,
        depending on whether the upload was tabular or a document.

    Raises:
        ValueError: if file_type is not one of the supported extensions.
    """
    # Record the filename in the upload log, creating the log on first use.
    if not os.path.exists(FILE_LOG):
        with open(FILE_LOG, "w") as f:
            json.dump([], f)
    with open(FILE_LOG, "r+") as f:
        files = json.load(f)
        if filename not in files:
            files.append(filename)
            f.seek(0)
            json.dump(files, f)
            f.truncate()  # new list may serialize shorter than the old content

    if file_type in ["csv", "xlsx", "json"]:
        if file_type == "csv":
            df = pd.read_csv(file_obj)
        elif file_type == "xlsx":
            # pandas imports openpyxl itself via engine=; no inline import needed.
            df = pd.read_excel(file_obj, engine="openpyxl", header=0)
        else:  # json
            df = pd.DataFrame(json.load(file_obj))
        # Normalize every tabular upload to CSV for uniform reloading later.
        df.to_csv(os.path.join(PERSIST_DIR, filename + ".csv"), index=False)
        return df, None, f"✅ {filename} stored for tabular querying."
    elif file_type in ["pdf", "docx"]:
        # extract_text_from_file is defined elsewhere in this module.
        text = extract_text_from_file(file_obj, file_type)
        chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_text(text)
        # Merge new chunks with previously stored ones, then rebuild the FAISS
        # index over the full corpus so earlier uploads remain searchable.
        if os.path.exists(TEXT_STORE):
            with open(TEXT_STORE, "r") as f:
                all_chunks = json.load(f)
        else:
            all_chunks = []
        all_chunks.extend(chunks)
        with open(TEXT_STORE, "w") as f:
            json.dump(all_chunks, f)
        vectorstore = FAISS.from_texts(all_chunks, embeddings_model)
        vectorstore.save_local(PERSIST_DIR)
        return None, vectorstore, f"✅ {filename} embedded into vectorstore."
    else:
        raise ValueError("Unsupported file type.")
# --- Agent 2: Query Resolution ---
def agent_theta_tabular():
    """Load every persisted CSV in PERSIST_DIR for tabular querying.

    Returns:
        (combined_df, tabular_md): one DataFrame concatenating all loadable
        CSVs (None when there are none), plus a markdown preview (first 5
        rows per file) for inclusion in the LLM prompt.
    """
    frames = []
    markdown_blocks = []
    for file in os.listdir(PERSIST_DIR):
        if not file.endswith(".csv"):
            continue
        try:
            df = pd.read_csv(os.path.join(PERSIST_DIR, file))
        except Exception as e:
            # Best-effort: one corrupt CSV must not block the others.
            print(f"❌ Failed to load {file}: {e}")
            continue
        frames.append(df)
        markdown_blocks.append(f"### File: {file}\n" + df.head(5).to_markdown(index=False))
    tabular_md = "\n\n".join(markdown_blocks)
    return pd.concat(frames, ignore_index=True) if frames else None, tabular_md
def agent_beta_query_processor(query):
    """Answer a user query from tabular data, document context, or plain chat.

    Falls back to a direct LLM call when nothing has been uploaded yet.
    Otherwise combines a pandas-dataframe agent (for CSVs) with a
    RetrievalQA chain (for document chunks) and returns all answers plus
    debug context as one string.

    Args:
        query: the user's natural-language question.

    Returns:
        A string containing the prompt context, agent answers, diagnostic
        logs, and a markdown dump of the combined DataFrame.
    """
    # Nothing uploaded at all → plain chat, no retrieval.
    if not os.path.exists(FILE_LOG) and not os.path.exists(TEXT_STORE):
        llm = OpenAI(openai_api_key=API_KEY)
        try:
            return llm.invoke(query).content
        except Exception as e:
            return f"❌ LLM Chat Failed: {e}"

    df_debug = ""
    document_context = []
    uploaded_files = []

    combined_df, tabular_md = agent_theta_tabular()
    if combined_df is None or combined_df.empty:
        df_debug = "⚠️ No rows found in uploaded CSV files."
    else:
        df_debug = combined_df.to_markdown(index=False)

    if os.path.exists(TEXT_STORE):
        with open(TEXT_STORE, "r") as f:
            document_context = json.load(f)
    if os.path.exists(FILE_LOG):
        with open(FILE_LOG) as f:
            uploaded_files = json.load(f)

    # Only the first 10 chunks are inlined to keep the prompt bounded.
    full_context = "\n\n".join(document_context[:10]) + "\n\n" + tabular_md
    print("🧠 FULL CONTEXT PASSED TO LLM:\n", full_context)
    file_listing = "\n".join(f"- {f}" for f in uploaded_files)  # NOTE(review): built but never used downstream

    llm = OpenAI(openai_api_key=API_KEY)
    response_parts = []
    llm_logs = []

    if combined_df is not None:
        # BUGFIX: this preview previously ran BEFORE the None check and raised
        # AttributeError ('NoneType' has no attribute 'head') with no CSVs.
        llm_logs.append("📄 Data preview (first 5 rows):\n" + combined_df.head().to_markdown(index=False))
        tabular_agent = create_pandas_dataframe_agent(llm, combined_df, verbose=True)
        llm_logs.append(f"📊 DataFrame columns: {combined_df.columns.tolist()}")
        llm_logs.append(f"📈 Total rows: {len(combined_df)}")
        try:
            custom_prompt = f"""
You are a smart data assistant with access to structured tabular data (e.g., Excel files) and unstructured text documents (e.g., Word files).
Use both sources to answer the user's question accurately.
## Tabular Data Preview (first 5 rows):
{combined_df.head().to_markdown(index=False)}
## Column Names:
{combined_df.columns.tolist()}
## Document Context (summary of text snippets):
{document_context[:5]}
## User Query:
{query}
Analyze the documents and table together. Prefer tabular output when answering.
"""
            table_answer = tabular_agent.run(custom_prompt)
            response_parts.append(f"🧾 Table Agent Answer:\n{table_answer}")
        except Exception as e:
            response_parts.append(f"⚠️ Table Agent failed: {e}")

    if document_context:
        vectorstore = FAISS.load_local(PERSIST_DIR, embeddings_model)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
        try:
            doc_answer = qa_chain.run(query)
            response_parts.append(f"📚 Document QA Answer:\n{doc_answer}")
        except Exception as e:
            response_parts.append(f"⚠️ Document QA failed: {e}")

    if not response_parts:
        response_parts.append("⚠️ No relevant data found to answer the query.")

    return full_context + "\n\n" + "\n\n".join(response_parts + llm_logs) + "\n\n---\n\n" + df_debug