import streamlit as st from tempfile import NamedTemporaryFile # Page-level look and feel st.set_page_config(page_title="Document & Data Copilot", page_icon="💬", layout="wide") import pprint import re import os from langchain_community.document_loaders import PyPDFLoader from langchain.document_loaders.csv_loader import CSVLoader from langchain.document_loaders import WebBaseLoader import duckdb import pandas as pd import numpy as np import pprint import requests import json defaultGoogleURL = "https://www.google.com/search?q=google+earnings" OPEN_ROUTER_MODEL = "meta-llama/llama-3.3-70b-instruct:free" DEFAULT_ECOMMERCE_CSV = "EcommerceDataset.csv" # Input for OpenRouter API Key OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"] if not OPEN_ROUTER_KEY: st.warning("Please enter your OpenRouter API Key to proceed.") st.stop() def call_openrouter(content: str) -> str: """Send a chat request to OpenRouter and return a safe string response.""" try: response = requests.post( url="https://openrouter.ai/api/v1/chat/completions", headers={ "Authorization": f"Bearer {OPEN_ROUTER_KEY}", "Content-Type": "application/json" }, data=json.dumps({ "model": OPEN_ROUTER_MODEL, "messages": [ { "role": "user", "content": content } ] }), timeout=60, ) except Exception as exc: return f"Request error: {exc}" if not response.ok: # Return status code plus body so the user knows what went wrong. return f"Request failed ({response.status_code}): {response.text}" try: data = response.json() except Exception as exc: return f"Invalid JSON response: {exc} | body: {response.text}" try: return data["choices"][0]["message"]["content"] except Exception: return f"Unexpected response format: {data}" def call_openrouter_messages(messages) -> str: """Generic OpenRouter call that accepts a messages list.""" try: response = requests.post( url="https://openrouter.ai/api/v1/chat/completions", headers={ "Authorization": f"Bearer {OPEN_ROUTER_KEY}", "Content-Type": "application/json" }, data=json.dumps({ "model": OPEN_ROUTER_MODEL, "messages": messages }), timeout=60, ) except Exception as exc: return f"Request error: {exc}" if not response.ok: return f"Request failed ({response.status_code}): {response.text}" try: data = response.json() except Exception as exc: return f"Invalid JSON response: {exc} | body: {response.text}" try: return data["choices"][0]["message"]["content"] except Exception: return f"Unexpected response format: {data}" def ask_llm(question: str, schema_text: str) -> str: """Ask the model to generate a DuckDB SQL query for the given question and schema.""" messages = [ { "role": "system", "content": f""" You are a data analyst. You MUST use ONLY this table: - Table name: data Schema: {schema_text} Rules: - Use ONLY table name "data" - Return ONE valid DuckDB SQL query - Do NOT explain - Do NOT use markdown """ }, {"role": "user", "content": question}, ] return call_openrouter_messages(messages) def explain_result(question: str, df: pd.DataFrame) -> str: """Ask the model to explain the result set in plain language.""" try: result_text = df.to_string(index=False) except Exception: result_text = str(df) messages = [ { "role": "system", "content": """ You are a data analyst. Given a user's question and a query result, produce a concise, human-like explanation. Rules: - Do NOT mention SQL, databases, or tables - Do NOT explain how the data was computed - Be clear and business-friendly """ }, { "role": "user", "content": f""" Question: {question} Query Result: {result_text} """ }, ] return call_openrouter_messages(messages) def sanitize_dataframe(df: pd.DataFrame): """Return a copy of df with column names sanitized for SQL identifiers.""" if df is None or not isinstance(df, pd.DataFrame): return df, {} rename_map = {} used = set() for col in df.columns: new_col = re.sub(r"[^0-9a-zA-Z_]+", "_", str(col)) new_col = new_col.strip("_") if re.match(r"^[0-9]", new_col): new_col = f"col_{new_col}" if not new_col: new_col = "col" base = new_col idx = 1 while new_col in used: new_col = f"{base}_{idx}" idx += 1 used.add(new_col) rename_map[col] = new_col return df.rename(columns=rename_map), rename_map def run_duckdb_qa(question: str, dataframe: pd.DataFrame) -> str: """Generate SQL via LLM, run it on DuckDB, and explain the result.""" if not question.strip(): return "Please enter a question." if dataframe is None or not isinstance(dataframe, pd.DataFrame): return "No CSV data loaded." clean_df, rename_map = sanitize_dataframe(dataframe) con = duckdb.connect() try: con.register("data", clean_df) schema_df = con.execute("DESCRIBE data").fetch_df() schema_text = schema_df.to_string(index=False) sql = ask_llm(question, schema_text) if not isinstance(sql, str): return f"Unexpected SQL response: {sql}" sql = sql.strip().strip(";") sql = re.sub(r"\bSTDEV\s*\(", "STDDEV(", sql, flags=re.IGNORECASE) result_df = con.execute(sql).fetch_df() except Exception as exc: return f"SQL error: {exc}\nSQL used:\n{locals().get('sql', 'N/A')}" finally: con.close() return explain_result(question, result_df) def format_data_preview(data, max_chars: int = 12000) -> str: """Return a trimmed, human-friendly preview to keep prompts under token limits.""" if data is None: return "No data loaded." try: if isinstance(data, pd.DataFrame): preview = data.head(20).to_csv(index=False) elif isinstance(data, list): chunks = [] for doc in data[:5]: text = getattr(doc, "page_content", str(doc)) if len(text) > 1500: text = text[:1500] + "...[truncated]" chunks.append(text) preview = "\n\n".join(chunks) else: preview = str(data) except Exception as exc: preview = f"Could not format data preview: {exc}" if len(preview) > max_chars: preview = preview[:max_chars] + "...[truncated]" return preview def summarize_csv(dataframe: pd.DataFrame) -> str: """Build a compact summary (top items, payment mix) from a CSV DataFrame.""" if dataframe is None or not isinstance(dataframe, pd.DataFrame): return "" summary_lines = [] quantity_col = next((c for c in dataframe.columns if c.lower().startswith("quantity")), None) desc_col = None for candidate in ("Description", "Product", "Item", "Product_Name"): if candidate in dataframe.columns: desc_col = candidate break payment_col = next((c for c in dataframe.columns if "payment" in c.lower()), None) if quantity_col and desc_col: try: top_items = ( dataframe.groupby(desc_col)[quantity_col] .sum() .sort_values(ascending=False) .head(10) ) summary_lines.append("Top items by quantity (sum):") summary_lines.append(top_items.to_string()) except Exception as exc: summary_lines.append(f"Could not compute top items: {exc}") if payment_col: try: payment_counts = dataframe[payment_col].value_counts().head(10) summary_lines.append("\nPayment method counts:") summary_lines.append(payment_counts.to_string()) except Exception as exc: summary_lines.append(f"Could not compute payment counts: {exc}") return "\n".join(summary_lines) def build_prompt(label: str, data, question: str, summary: str = "") -> str: preview = format_data_preview(data) summary_text = summary.strip() summary_block = f"\nData summary:\n{summary_text}\n" if summary_text else "" return f"""Do not reply with a python code. Data preview ({label}, truncated to avoid context limits): {preview} {summary_block} User question: {question} """ def pretty_print_columns(text): """ Beautifies the provided CSV column description text. Args: text (str): The input string containing the column descriptions. Returns: str: The beautified string with neatly formatted column descriptions. """ return " ".join([line.strip() for line in text.splitlines() if line.strip()]) radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)", "Upload my own CSV", "Upload my own PDF", f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})", "Enter my own URL"] # Visual polish st.markdown(""" """, unsafe_allow_html=True) st.markdown("""

Document & Data Copilot

Chat with your PDFs, spreadsheets, or live web pages.

Upload or pick a preset, ask a focused question, and get a clear answer without digging through the source yourself.

Summaries Follow-up questions Trends & metrics Plain-language insights
""", unsafe_allow_html=True) info_left, info_right = st.columns([1.05, 1]) with info_left: st.markdown("#### What you can do") st.markdown("- Skim long PDFs in a few bullet points\n- Ask for top performers or outliers in CSVs\n- Pull key quotes or facts from a URL\n- Iterate with follow-up questions to refine") with info_right: st.markdown("#### Quick tips") st.markdown("- Keep prompts short and specific\n- Mention the format you want (bullets, table, headline)\n- Ask one question at a time for best results\n- You can chain questions; context is remembered") st.markdown("### Choose a source to explore") genre = st.radio( "Pick the content you want to chat with", radioButtonList, index=0, key="source_radio" ) pdfCSVURLText = "" exampleQuestion = "" csv_data = None pdf_pages = None if genre==radioButtonList[1]: pdfCSVURLText = "CSV" exampleQuestion = "What are the data columns?" elif genre==radioButtonList[2]: pdfCSVURLText = "PDF" exampleQuestion = "Can you summarize the contents?" elif genre==radioButtonList[3]: pdfCSVURLText = "URL" exampleQuestion = "What is Google's latest earnings?" elif genre==radioButtonList[4]: pdfCSVURLText = "URL" exampleQuestion = "Can you summarize the contents?" else: # Default, E-commerce CSV pdfCSVURLText = "CSV" exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?" if os.path.exists(DEFAULT_ECOMMERCE_CSV): try: csv_data = pd.read_csv(DEFAULT_ECOMMERCE_CSV) except Exception as exc: st.warning(f"Problem loading {DEFAULT_ECOMMERCE_CSV} ({exc}). Falling back to a small sample dataset.") if csv_data is None: # Keep a tiny inline sample so the app still works even when the CSV is missing locally. csv_data = pd.DataFrame( [ {"InvoiceNo": "536365", "StockCode": "85123A", "Description": "White hanging heart", "Quantity": 6, "UnitPrice": 2.55, "Country": "United Kingdom"}, {"InvoiceNo": "536366", "StockCode": "71053", "Description": "White metal lantern", "Quantity": 6, "UnitPrice": 3.39, "Country": "United Kingdom"}, {"InvoiceNo": "536367", "StockCode": "84406B", "Description": "Pink mini hanging heart", "Quantity": 8, "UnitPrice": 1.65, "Country": "United Kingdom"}, ] ) st.info(f"{DEFAULT_ECOMMERCE_CSV} not found. Using an inline sample instead. Upload your own CSV if you need the full dataset.") st.markdown("### Add your data") st.caption("Upload a CSV/PDF or paste a URL. The built-in e-commerce sample is ready immediately.") if exampleQuestion: st.markdown( f"""
Suggested prompt
{exampleQuestion}
""", unsafe_allow_html=True, ) isCustomURL = genre==radioButtonList[4] urlInput = st.text_input('Enter your own URL', '', placeholder=f"Type your URL here (e.g. {defaultGoogleURL})", disabled=not isCustomURL) isCustomUpload = genre==radioButtonList[1] or genre==radioButtonList[2] uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomUpload) uploadedFilename = "" if uploaded_file is not None: if genre==radioButtonList[1]: # Custom CSV Upload try: csv_data = pd.read_csv(uploaded_file) except Exception as exc: st.error(f"Could not read uploaded CSV: {exc}") elif genre==radioButtonList[2]: # Custom PDF Upload with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}', delete=False) as f: f.write(uploaded_file.getbuffer()) uploadedFilename = f.name try: loader = PyPDFLoader(uploadedFilename) pdf_pages = loader.load_and_split() except Exception as exc: st.error(f"Could not read uploaded PDF: {exc}") finally: if uploadedFilename and os.path.exists(uploadedFilename): os.remove(uploadedFilename) enableChatBox = False if genre==radioButtonList[1]: # Custom CSV Upload enableChatBox = isinstance(csv_data, pd.DataFrame) elif genre==radioButtonList[2]: # Custom PDF Upload enableChatBox = pdf_pages is not None elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report enableChatBox = True elif genre==radioButtonList[4]: # Custom URL enableChatBox = True else: # E-commerce CSV enableChatBox = True status_class = "ready" if enableChatBox else "idle" status_text = "Ready to chat" if enableChatBox else "Load a file or URL to start" st.markdown(f'
{status_text}
', unsafe_allow_html=True) st.markdown("### Ask a question") st.caption("Short, specific prompts work best. You can ask follow-ups without reloading.") chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox) chatWithPDFButton = "CLICK HERE TO START CHATTING" if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): # Button Cliked if genre==radioButtonList[0]: # E-commerce CSV st.write(run_duckdb_qa(chatTextStr, csv_data)) elif genre==radioButtonList[1]: # Custom CSV Upload st.write(run_duckdb_qa(chatTextStr, csv_data)) elif genre==radioButtonList[2]: # Custom PDF Upload content = build_prompt("Uploaded PDF", pdf_pages, chatTextStr) st.write(call_openrouter(content)) elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report loader = WebBaseLoader(defaultGoogleURL) web_data = loader.load() content = build_prompt("Google earnings URL", web_data, chatTextStr) st.write(call_openrouter(content)) elif genre==radioButtonList[4]: # Custom URL if not urlInput.strip(): st.warning("Please enter a URL first.") else: loader = WebBaseLoader(urlInput) web_data = loader.load() content = build_prompt("Custom URL", web_data, chatTextStr) st.write(call_openrouter(content))