# NOTE: extracted from a Hugging Face Spaces page; the Space was showing
# "Runtime error" at capture time, and table markup / mojibake from the
# extraction remains in the source below.
| # π§ Gradio RAG App for Hugging Face Deployment (Multi-File Support) | |
| # --- Imports --- | |
| import os | |
| import io | |
| import json | |
| import pandas as pd | |
| from dotenv import load_dotenv, find_dotenv | |
| from langchain_community.embeddings import OpenAIEmbeddings | |
| from langchain_community.chat_models import ChatOpenAI as OpenAI | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.chains import RetrievalQA | |
| import PyPDF2 | |
| from docx import Document | |
# --- Load API Key ---
_ = load_dotenv(find_dotenv())

API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    # Fail fast: every downstream agent needs an OpenAI key.
    raise ValueError("β OPENAI_API_KEY is missing. Please add it to .env or set as environment variable.")

# Shared embedding model used for both indexing and retrieval.
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)

# On-disk persistence: CSV copies, raw text chunks, upload log, FAISS index.
PERSIST_DIR = "persist_data"
os.makedirs(PERSIST_DIR, exist_ok=True)
TEXT_STORE = os.path.join(PERSIST_DIR, "text_chunks.json")
FILE_LOG = os.path.join(PERSIST_DIR, "uploaded_files.json")
# --- Agent 1: File Ingestion and Indexing ---
def agent_alpha_file_uploader(file_obj, file_type, filename):
    """Ingest one uploaded file and persist it for later querying.

    Tabular uploads (csv/xlsx/json) are normalized to a CSV copy inside
    PERSIST_DIR; document uploads (pdf/docx) are chunked, appended to
    TEXT_STORE, and re-embedded into the FAISS vectorstore.

    Args:
        file_obj: open file-like object holding the upload's content.
        file_type: lowercase extension ("csv", "xlsx", "json", "pdf", "docx").
        filename: original upload name, used for the log and the CSV copy.

    Returns:
        (df, vectorstore, status_message) — df is set for tabular uploads,
        vectorstore for document uploads; the unused slot is None.

    Raises:
        ValueError: for any unsupported file_type.
    """
    _log_uploaded_file(filename)

    if file_type in ("csv", "xlsx", "json"):
        if file_type == "csv":
            df = pd.read_csv(file_obj)
        elif file_type == "xlsx":
            df = pd.read_excel(file_obj, engine="openpyxl", header=0)
        else:  # json: expects a list-of-records / dict layout DataFrame accepts
            df = pd.DataFrame(json.load(file_obj))
        # Persist a CSV copy so agent_theta_tabular can rebuild context later.
        df.to_csv(os.path.join(PERSIST_DIR, filename + ".csv"), index=False)
        # Bug fix: status message printed the literal "(unknown)" instead of
        # interpolating the uploaded filename.
        return df, None, f"β {filename} stored for tabular querying."

    if file_type in ("pdf", "docx"):
        # NOTE(review): extract_text_from_file is defined elsewhere in this file.
        text = extract_text_from_file(file_obj, file_type)
        chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_text(text)
        all_chunks = _load_json(TEXT_STORE, default=[])
        all_chunks.extend(chunks)
        with open(TEXT_STORE, "w") as f:
            json.dump(all_chunks, f)
        # Rebuild the FAISS index over ALL chunks so earlier documents stay queryable.
        vectorstore = FAISS.from_texts(all_chunks, embeddings_model)
        vectorstore.save_local(PERSIST_DIR)
        return None, vectorstore, f"β {filename} embedded into vectorstore."

    raise ValueError("Unsupported file type.")


def _log_uploaded_file(filename):
    """Append filename to the persistent upload log, deduplicated."""
    if not os.path.exists(FILE_LOG):
        with open(FILE_LOG, "w") as f:
            json.dump([], f)
    with open(FILE_LOG, "r+") as f:
        files = json.load(f)
        if filename not in files:
            files.append(filename)
            f.seek(0)
            json.dump(files, f)
            f.truncate()  # log may have shrunk relative to the old contents


def _load_json(path, default=None):
    """Read a JSON file, returning `default` when the file does not exist."""
    if not os.path.exists(path):
        return default
    with open(path, "r") as f:
        return json.load(f)
# --- Agent 2: Query Resolution ---
def agent_theta_tabular():
    """Collect every persisted CSV into one DataFrame plus a markdown preview.

    Returns:
        (combined, markdown) — combined is a single concatenated DataFrame
        (None when no CSV could be loaded); markdown holds one 5-row
        preview block per file, for use as LLM prompt context.
    """
    frames = []
    previews = []
    for name in os.listdir(PERSIST_DIR):
        if not name.endswith(".csv"):
            continue
        try:
            frame = pd.read_csv(os.path.join(PERSIST_DIR, name))
        except Exception as exc:  # best-effort: skip unreadable files
            print(f"β Failed to load {name}: {exc}")
            continue
        frames.append(frame)
        previews.append(f"### File: {name}\n" + frame.head(5).to_markdown(index=False))
    combined = pd.concat(frames, ignore_index=True) if frames else None
    return combined, "\n\n".join(previews)
def agent_beta_query_processor(query):
    """Answer `query` using every ingested source (tables and documents).

    Falls back to a plain LLM chat when nothing has been uploaded yet.
    Otherwise combines a pandas-dataframe agent over the concatenated CSV
    data with a RetrievalQA chain over the FAISS vectorstore, and returns
    one string with the context, both answers, logs, and a data dump.

    Args:
        query: the user's natural-language question.

    Returns:
        str: full context + agent answers + diagnostic logs.
    """
    # No uploads at all -> plain chat completion.
    if not os.path.exists(FILE_LOG) and not os.path.exists(TEXT_STORE):
        llm = OpenAI(openai_api_key=API_KEY)
        try:
            return llm.invoke(query).content
        except Exception as e:
            return f"β LLM Chat Failed: {e}"

    document_context = []
    uploaded_files = []

    combined_df, tabular_md = agent_theta_tabular()
    if combined_df is None or combined_df.empty:
        df_debug = "β οΈ No rows found in uploaded CSV files."
    else:
        df_debug = combined_df.to_markdown(index=False)

    if os.path.exists(TEXT_STORE):
        with open(TEXT_STORE, "r") as f:
            document_context = json.load(f)
    if os.path.exists(FILE_LOG):
        with open(FILE_LOG) as f:
            uploaded_files = json.load(f)

    full_context = "\n\n".join(document_context[:10]) + "\n\n" + tabular_md
    print("π§ FULL CONTEXT PASSED TO LLM:\n", full_context)
    # NOTE(review): file_listing is built but never used downstream — confirm
    # whether it was meant to be part of the prompt or the returned answer.
    file_listing = "\n".join(f"- {f}" for f in uploaded_files)

    llm = OpenAI(openai_api_key=API_KEY)
    response_parts = []
    llm_logs = []

    if combined_df is not None:
        # Bug fix: the data-preview log line previously ran BEFORE this None
        # guard and raised AttributeError when no CSVs had been uploaded.
        llm_logs.append("π Data preview (first 5 rows):\n" + combined_df.head().to_markdown(index=False))
        tabular_agent = create_pandas_dataframe_agent(llm, combined_df, verbose=True)
        llm_logs.append(f"π DataFrame columns: {combined_df.columns.tolist()}")
        llm_logs.append(f"π Total rows: {len(combined_df)}")
        try:
            custom_prompt = f"""
You are a smart data assistant with access to structured tabular data (e.g., Excel files) and unstructured text documents (e.g., Word files).
Use both sources to answer the user's question accurately.
## Tabular Data Preview (first 5 rows):
{combined_df.head().to_markdown(index=False)}
## Column Names:
{combined_df.columns.tolist()}
## Document Context (summary of text snippets):
{document_context[:5]}
## User Query:
{query}
Analyze the documents and table together. Prefer tabular output when answering.
"""
            table_answer = tabular_agent.run(custom_prompt)
            response_parts.append(f"π§Ύ Table Agent Answer:\n{table_answer}")
        except Exception as e:
            response_parts.append(f"β οΈ Table Agent failed: {e}")

    if document_context:
        # NOTE(review): recent langchain releases require
        # allow_dangerous_deserialization=True for load_local — verify the
        # installed version before deploying.
        vectorstore = FAISS.load_local(PERSIST_DIR, embeddings_model)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
        try:
            doc_answer = qa_chain.run(query)
            response_parts.append(f"π Document QA Answer:\n{doc_answer}")
        except Exception as e:
            response_parts.append(f"β οΈ Document QA failed: {e}")

    if not response_parts:
        response_parts.append("β οΈ No relevant data found to answer the query.")

    return full_context + "\n\n" + "\n\n".join(response_parts + llm_logs) + "\n\n---\n\n" + df_debug