import streamlit as st from tempfile import NamedTemporaryFile # Page-level look and feel st.set_page_config(page_title="Document & Data Copilot", page_icon="💬", layout="wide") import pprint import re import os from langchain_community.document_loaders import PyPDFLoader from langchain.document_loaders.csv_loader import CSVLoader from langchain.document_loaders import WebBaseLoader import duckdb import pandas as pd import numpy as np import pprint import requests import json defaultGoogleURL = "https://www.google.com/search?q=google+earnings" OPEN_ROUTER_MODEL = "meta-llama/llama-3.3-70b-instruct:free" DEFAULT_ECOMMERCE_CSV = "EcommerceDataset.csv" # Input for OpenRouter API Key OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"] if not OPEN_ROUTER_KEY: st.warning("Please enter your OpenRouter API Key to proceed.") st.stop() def call_openrouter(content: str) -> str: """Send a chat request to OpenRouter and return a safe string response.""" try: response = requests.post( url="https://openrouter.ai/api/v1/chat/completions", headers={ "Authorization": f"Bearer {OPEN_ROUTER_KEY}", "Content-Type": "application/json" }, data=json.dumps({ "model": OPEN_ROUTER_MODEL, "messages": [ { "role": "user", "content": content } ] }), timeout=60, ) except Exception as exc: return f"Request error: {exc}" if not response.ok: # Return status code plus body so the user knows what went wrong. return f"Request failed ({response.status_code}): {response.text}" try: data = response.json() except Exception as exc: return f"Invalid JSON response: {exc} | body: {response.text}" try: return data["choices"][0]["message"]["content"] except Exception: return f"Unexpected response format: {data}" def call_openrouter_messages(messages) -> str: """Generic OpenRouter call that accepts a messages list.""" try: response = requests.post( url="https://openrouter.ai/api/v1/chat/completions", headers={ "Authorization": f"Bearer {OPEN_ROUTER_KEY}", "Content-Type": "application/json" }, data=json.dumps({ "model": OPEN_ROUTER_MODEL, "messages": messages }), timeout=60, ) except Exception as exc: return f"Request error: {exc}" if not response.ok: return f"Request failed ({response.status_code}): {response.text}" try: data = response.json() except Exception as exc: return f"Invalid JSON response: {exc} | body: {response.text}" try: return data["choices"][0]["message"]["content"] except Exception: return f"Unexpected response format: {data}" def ask_llm(question: str, schema_text: str) -> str: """Ask the model to generate a DuckDB SQL query for the given question and schema.""" messages = [ { "role": "system", "content": f""" You are a data analyst. You MUST use ONLY this table: - Table name: data Schema: {schema_text} Rules: - Use ONLY table name "data" - Return ONE valid DuckDB SQL query - Do NOT explain - Do NOT use markdown """ }, {"role": "user", "content": question}, ] return call_openrouter_messages(messages) def explain_result(question: str, df: pd.DataFrame) -> str: """Ask the model to explain the result set in plain language.""" try: result_text = df.to_string(index=False) except Exception: result_text = str(df) messages = [ { "role": "system", "content": """ You are a data analyst. Given a user's question and a query result, produce a concise, human-like explanation. Rules: - Do NOT mention SQL, databases, or tables - Do NOT explain how the data was computed - Be clear and business-friendly """ }, { "role": "user", "content": f""" Question: {question} Query Result: {result_text} """ }, ] return call_openrouter_messages(messages) def sanitize_dataframe(df: pd.DataFrame): """Return a copy of df with column names sanitized for SQL identifiers.""" if df is None or not isinstance(df, pd.DataFrame): return df, {} rename_map = {} used = set() for col in df.columns: new_col = re.sub(r"[^0-9a-zA-Z_]+", "_", str(col)) new_col = new_col.strip("_") if re.match(r"^[0-9]", new_col): new_col = f"col_{new_col}" if not new_col: new_col = "col" base = new_col idx = 1 while new_col in used: new_col = f"{base}_{idx}" idx += 1 used.add(new_col) rename_map[col] = new_col return df.rename(columns=rename_map), rename_map def run_duckdb_qa(question: str, dataframe: pd.DataFrame) -> str: """Generate SQL via LLM, run it on DuckDB, and explain the result.""" if not question.strip(): return "Please enter a question." if dataframe is None or not isinstance(dataframe, pd.DataFrame): return "No CSV data loaded." clean_df, rename_map = sanitize_dataframe(dataframe) con = duckdb.connect() try: con.register("data", clean_df) schema_df = con.execute("DESCRIBE data").fetch_df() schema_text = schema_df.to_string(index=False) sql = ask_llm(question, schema_text) if not isinstance(sql, str): return f"Unexpected SQL response: {sql}" sql = sql.strip().strip(";") sql = re.sub(r"\bSTDEV\s*\(", "STDDEV(", sql, flags=re.IGNORECASE) result_df = con.execute(sql).fetch_df() except Exception as exc: return f"SQL error: {exc}\nSQL used:\n{locals().get('sql', 'N/A')}" finally: con.close() return explain_result(question, result_df) def format_data_preview(data, max_chars: int = 12000) -> str: """Return a trimmed, human-friendly preview to keep prompts under token limits.""" if data is None: return "No data loaded." try: if isinstance(data, pd.DataFrame): preview = data.head(20).to_csv(index=False) elif isinstance(data, list): chunks = [] for doc in data[:5]: text = getattr(doc, "page_content", str(doc)) if len(text) > 1500: text = text[:1500] + "...[truncated]" chunks.append(text) preview = "\n\n".join(chunks) else: preview = str(data) except Exception as exc: preview = f"Could not format data preview: {exc}" if len(preview) > max_chars: preview = preview[:max_chars] + "...[truncated]" return preview def summarize_csv(dataframe: pd.DataFrame) -> str: """Build a compact summary (top items, payment mix) from a CSV DataFrame.""" if dataframe is None or not isinstance(dataframe, pd.DataFrame): return "" summary_lines = [] quantity_col = next((c for c in dataframe.columns if c.lower().startswith("quantity")), None) desc_col = None for candidate in ("Description", "Product", "Item", "Product_Name"): if candidate in dataframe.columns: desc_col = candidate break payment_col = next((c for c in dataframe.columns if "payment" in c.lower()), None) if quantity_col and desc_col: try: top_items = ( dataframe.groupby(desc_col)[quantity_col] .sum() .sort_values(ascending=False) .head(10) ) summary_lines.append("Top items by quantity (sum):") summary_lines.append(top_items.to_string()) except Exception as exc: summary_lines.append(f"Could not compute top items: {exc}") if payment_col: try: payment_counts = dataframe[payment_col].value_counts().head(10) summary_lines.append("\nPayment method counts:") summary_lines.append(payment_counts.to_string()) except Exception as exc: summary_lines.append(f"Could not compute payment counts: {exc}") return "\n".join(summary_lines) def build_prompt(label: str, data, question: str, summary: str = "") -> str: preview = format_data_preview(data) summary_text = summary.strip() summary_block = f"\nData summary:\n{summary_text}\n" if summary_text else "" return f"""Do not reply with a python code. Data preview ({label}, truncated to avoid context limits): {preview} {summary_block} User question: {question} """ def pretty_print_columns(text): """ Beautifies the provided CSV column description text. Args: text (str): The input string containing the column descriptions. Returns: str: The beautified string with neatly formatted column descriptions. """ return " ".join([line.strip() for line in text.splitlines() if line.strip()]) radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)", "Upload my own CSV", "Upload my own PDF", f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})", "Enter my own URL"] # Visual polish st.markdown(""" """, unsafe_allow_html=True) st.markdown("""
Document & Data Copilot
Upload or pick a preset, ask a focused question, and get a clear answer without digging through the source yourself.