# gradio_RAG_CHAT / app.py
# Source: PRSHNTKUMR's Hugging Face Space — "Update app.py", commit 6273e84 (verified)
# 🧠 Gradio RAG App for Hugging Face Deployment (Multi-File Support)
# --- Imports ---
import os
import io
import json
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI as OpenAI
from langchain_community.vectorstores import FAISS
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
import PyPDF2
from docx import Document
# --- Load API Key & shared state ---
# Load OPENAI_API_KEY from a .env file (if present) or the process environment.
_ = load_dotenv(find_dotenv())
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    # Fail fast: every agent below needs the key at construction time.
    raise ValueError("❌ OPENAI_API_KEY is missing. Please add it to .env or set as environment variable.")

# Shared embedding model, used both for indexing uploads and for retrieval.
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)

# On-disk persistence: normalized CSV copies, text chunks, FAISS index, upload log.
PERSIST_DIR = "persist_data"
os.makedirs(PERSIST_DIR, exist_ok=True)
TEXT_STORE = os.path.join(PERSIST_DIR, "text_chunks.json")   # JSON list of document chunks
FILE_LOG = os.path.join(PERSIST_DIR, "uploaded_files.json")  # JSON list of uploaded filenames
# --- Agent 1: File Ingestion and Indexing ---
def agent_alpha_file_uploader(file_obj, file_type, filename):
    """Ingest one uploaded file and persist it for later querying.

    Tabular files (csv/xlsx/json) are normalized to a CSV copy under
    PERSIST_DIR so Agent Theta can reload them; document files (pdf/docx)
    are chunked, appended to the shared text store, and re-embedded into
    the FAISS vectorstore.

    Args:
        file_obj: open file-like object for the upload.
        file_type: lowercase extension ("csv", "xlsx", "json", "pdf", "docx").
        filename: original upload name, used for the log and persisted copy.

    Returns:
        (df, vectorstore, status): exactly one of df/vectorstore is non-None,
        depending on whether the upload was tabular or a document.

    Raises:
        ValueError: if file_type is not one of the supported extensions.
    """
    # Record the filename in the upload log, creating the log on first use.
    if not os.path.exists(FILE_LOG):
        with open(FILE_LOG, "w") as f:
            json.dump([], f)
    with open(FILE_LOG, "r+") as f:
        files = json.load(f)
        if filename not in files:
            files.append(filename)
            f.seek(0)
            json.dump(files, f)
            f.truncate()  # new list may serialize shorter than the old content

    if file_type in ["csv", "xlsx", "json"]:
        if file_type == "csv":
            df = pd.read_csv(file_obj)
        elif file_type == "xlsx":
            # pandas imports openpyxl itself via engine=; no inline import needed.
            df = pd.read_excel(file_obj, engine="openpyxl", header=0)
        else:  # json
            df = pd.DataFrame(json.load(file_obj))
        # Normalize every tabular upload to CSV for uniform reloading later.
        df.to_csv(os.path.join(PERSIST_DIR, filename + ".csv"), index=False)
        return df, None, f"✅ {filename} stored for tabular querying."
    elif file_type in ["pdf", "docx"]:
        # extract_text_from_file is defined elsewhere in this module.
        text = extract_text_from_file(file_obj, file_type)
        chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_text(text)
        # Merge new chunks with previously stored ones, then rebuild the FAISS
        # index over the full corpus so earlier uploads remain searchable.
        if os.path.exists(TEXT_STORE):
            with open(TEXT_STORE, "r") as f:
                all_chunks = json.load(f)
        else:
            all_chunks = []
        all_chunks.extend(chunks)
        with open(TEXT_STORE, "w") as f:
            json.dump(all_chunks, f)
        vectorstore = FAISS.from_texts(all_chunks, embeddings_model)
        vectorstore.save_local(PERSIST_DIR)
        return None, vectorstore, f"✅ {filename} embedded into vectorstore."
    else:
        raise ValueError("Unsupported file type.")
# --- Agent 2: Query Resolution ---
def agent_theta_tabular():
    """Load every persisted CSV in PERSIST_DIR for tabular querying.

    Returns:
        (combined_df, tabular_md): one DataFrame concatenating all loadable
        CSVs (None when there are none), plus a markdown preview (first 5
        rows per file) for inclusion in the LLM prompt.
    """
    frames = []
    markdown_blocks = []
    for file in os.listdir(PERSIST_DIR):
        if not file.endswith(".csv"):
            continue
        try:
            df = pd.read_csv(os.path.join(PERSIST_DIR, file))
        except Exception as e:
            # Best-effort: one corrupt CSV must not block the others.
            print(f"❌ Failed to load {file}: {e}")
            continue
        frames.append(df)
        markdown_blocks.append(f"### File: {file}\n" + df.head(5).to_markdown(index=False))
    tabular_md = "\n\n".join(markdown_blocks)
    return pd.concat(frames, ignore_index=True) if frames else None, tabular_md
def agent_beta_query_processor(query):
    """Answer a user query from tabular data, document context, or plain chat.

    Falls back to a direct LLM call when nothing has been uploaded yet.
    Otherwise combines a pandas-dataframe agent (for CSVs) with a
    RetrievalQA chain (for document chunks) and returns all answers plus
    debug context as one string.

    Args:
        query: the user's natural-language question.

    Returns:
        A string containing the prompt context, agent answers, diagnostic
        logs, and a markdown dump of the combined DataFrame.
    """
    # Nothing uploaded at all → plain chat, no retrieval.
    if not os.path.exists(FILE_LOG) and not os.path.exists(TEXT_STORE):
        llm = OpenAI(openai_api_key=API_KEY)
        try:
            return llm.invoke(query).content
        except Exception as e:
            return f"❌ LLM Chat Failed: {e}"

    df_debug = ""
    document_context = []
    uploaded_files = []

    combined_df, tabular_md = agent_theta_tabular()
    if combined_df is None or combined_df.empty:
        df_debug = "⚠️ No rows found in uploaded CSV files."
    else:
        df_debug = combined_df.to_markdown(index=False)

    if os.path.exists(TEXT_STORE):
        with open(TEXT_STORE, "r") as f:
            document_context = json.load(f)
    if os.path.exists(FILE_LOG):
        with open(FILE_LOG) as f:
            uploaded_files = json.load(f)

    # Only the first 10 chunks are inlined to keep the prompt bounded.
    full_context = "\n\n".join(document_context[:10]) + "\n\n" + tabular_md
    print("🧠 FULL CONTEXT PASSED TO LLM:\n", full_context)
    file_listing = "\n".join(f"- {f}" for f in uploaded_files)  # NOTE(review): built but never used downstream

    llm = OpenAI(openai_api_key=API_KEY)
    response_parts = []
    llm_logs = []

    if combined_df is not None:
        # BUGFIX: this preview previously ran BEFORE the None check and raised
        # AttributeError ('NoneType' has no attribute 'head') with no CSVs.
        llm_logs.append("📄 Data preview (first 5 rows):\n" + combined_df.head().to_markdown(index=False))
        tabular_agent = create_pandas_dataframe_agent(llm, combined_df, verbose=True)
        llm_logs.append(f"📊 DataFrame columns: {combined_df.columns.tolist()}")
        llm_logs.append(f"📈 Total rows: {len(combined_df)}")
        try:
            custom_prompt = f"""
You are a smart data assistant with access to structured tabular data (e.g., Excel files) and unstructured text documents (e.g., Word files).
Use both sources to answer the user's question accurately.
## Tabular Data Preview (first 5 rows):
{combined_df.head().to_markdown(index=False)}
## Column Names:
{combined_df.columns.tolist()}
## Document Context (summary of text snippets):
{document_context[:5]}
## User Query:
{query}
Analyze the documents and table together. Prefer tabular output when answering.
"""
            table_answer = tabular_agent.run(custom_prompt)
            response_parts.append(f"🧾 Table Agent Answer:\n{table_answer}")
        except Exception as e:
            response_parts.append(f"⚠️ Table Agent failed: {e}")

    if document_context:
        vectorstore = FAISS.load_local(PERSIST_DIR, embeddings_model)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
        try:
            doc_answer = qa_chain.run(query)
            response_parts.append(f"📚 Document QA Answer:\n{doc_answer}")
        except Exception as e:
            response_parts.append(f"⚠️ Document QA failed: {e}")

    if not response_parts:
        response_parts.append("⚠️ No relevant data found to answer the query.")

    return full_context + "\n\n" + "\n\n".join(response_parts + llm_logs) + "\n\n---\n\n" + df_debug