Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from tempfile import NamedTemporaryFile | |
| # Page-level look and feel | |
| st.set_page_config(page_title="Document & Data Copilot", page_icon="💬", layout="wide") | |
| import pprint | |
| import re | |
| import os | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.document_loaders.csv_loader import CSVLoader | |
| from langchain.document_loaders import WebBaseLoader | |
| import duckdb | |
| import pandas as pd | |
| import numpy as np | |
| import pprint | |
| import requests | |
| import json | |
| defaultGoogleURL = "https://www.google.com/search?q=google+earnings" | |
| OPEN_ROUTER_MODEL = "meta-llama/llama-3.3-70b-instruct:free" | |
| DEFAULT_ECOMMERCE_CSV = "EcommerceDataset.csv" | |
| # Input for OpenRouter API Key | |
| OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"] | |
| if not OPEN_ROUTER_KEY: | |
| st.warning("Please enter your OpenRouter API Key to proceed.") | |
| st.stop() | |
| def call_openrouter(content: str) -> str: | |
| """Send a chat request to OpenRouter and return a safe string response.""" | |
| try: | |
| response = requests.post( | |
| url="https://openrouter.ai/api/v1/chat/completions", | |
| headers={ | |
| "Authorization": f"Bearer {OPEN_ROUTER_KEY}", | |
| "Content-Type": "application/json" | |
| }, | |
| data=json.dumps({ | |
| "model": OPEN_ROUTER_MODEL, | |
| "messages": [ | |
| { | |
| "role": "user", | |
| "content": content | |
| } | |
| ] | |
| }), | |
| timeout=60, | |
| ) | |
| except Exception as exc: | |
| return f"Request error: {exc}" | |
| if not response.ok: | |
| # Return status code plus body so the user knows what went wrong. | |
| return f"Request failed ({response.status_code}): {response.text}" | |
| try: | |
| data = response.json() | |
| except Exception as exc: | |
| return f"Invalid JSON response: {exc} | body: {response.text}" | |
| try: | |
| return data["choices"][0]["message"]["content"] | |
| except Exception: | |
| return f"Unexpected response format: {data}" | |
| def call_openrouter_messages(messages) -> str: | |
| """Generic OpenRouter call that accepts a messages list.""" | |
| try: | |
| response = requests.post( | |
| url="https://openrouter.ai/api/v1/chat/completions", | |
| headers={ | |
| "Authorization": f"Bearer {OPEN_ROUTER_KEY}", | |
| "Content-Type": "application/json" | |
| }, | |
| data=json.dumps({ | |
| "model": OPEN_ROUTER_MODEL, | |
| "messages": messages | |
| }), | |
| timeout=60, | |
| ) | |
| except Exception as exc: | |
| return f"Request error: {exc}" | |
| if not response.ok: | |
| return f"Request failed ({response.status_code}): {response.text}" | |
| try: | |
| data = response.json() | |
| except Exception as exc: | |
| return f"Invalid JSON response: {exc} | body: {response.text}" | |
| try: | |
| return data["choices"][0]["message"]["content"] | |
| except Exception: | |
| return f"Unexpected response format: {data}" | |
| def ask_llm(question: str, schema_text: str) -> str: | |
| """Ask the model to generate a DuckDB SQL query for the given question and schema.""" | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": f""" | |
| You are a data analyst. | |
| You MUST use ONLY this table: | |
| - Table name: data | |
| Schema: | |
| {schema_text} | |
| Rules: | |
| - Use ONLY table name "data" | |
| - Return ONE valid DuckDB SQL query | |
| - Do NOT explain | |
| - Do NOT use markdown | |
| """ | |
| }, | |
| {"role": "user", "content": question}, | |
| ] | |
| return call_openrouter_messages(messages) | |
| def explain_result(question: str, df: pd.DataFrame) -> str: | |
| """Ask the model to explain the result set in plain language.""" | |
| try: | |
| result_text = df.to_string(index=False) | |
| except Exception: | |
| result_text = str(df) | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": """ | |
| You are a data analyst. | |
| Given a user's question and a query result, | |
| produce a concise, human-like explanation. | |
| Rules: | |
| - Do NOT mention SQL, databases, or tables | |
| - Do NOT explain how the data was computed | |
| - Be clear and business-friendly | |
| """ | |
| }, | |
| { | |
| "role": "user", | |
| "content": f""" | |
| Question: | |
| {question} | |
| Query Result: | |
| {result_text} | |
| """ | |
| }, | |
| ] | |
| return call_openrouter_messages(messages) | |
| def sanitize_dataframe(df: pd.DataFrame): | |
| """Return a copy of df with column names sanitized for SQL identifiers.""" | |
| if df is None or not isinstance(df, pd.DataFrame): | |
| return df, {} | |
| rename_map = {} | |
| used = set() | |
| for col in df.columns: | |
| new_col = re.sub(r"[^0-9a-zA-Z_]+", "_", str(col)) | |
| new_col = new_col.strip("_") | |
| if re.match(r"^[0-9]", new_col): | |
| new_col = f"col_{new_col}" | |
| if not new_col: | |
| new_col = "col" | |
| base = new_col | |
| idx = 1 | |
| while new_col in used: | |
| new_col = f"{base}_{idx}" | |
| idx += 1 | |
| used.add(new_col) | |
| rename_map[col] = new_col | |
| return df.rename(columns=rename_map), rename_map | |
| def run_duckdb_qa(question: str, dataframe: pd.DataFrame) -> str: | |
| """Generate SQL via LLM, run it on DuckDB, and explain the result.""" | |
| if not question.strip(): | |
| return "Please enter a question." | |
| if dataframe is None or not isinstance(dataframe, pd.DataFrame): | |
| return "No CSV data loaded." | |
| clean_df, rename_map = sanitize_dataframe(dataframe) | |
| con = duckdb.connect() | |
| try: | |
| con.register("data", clean_df) | |
| schema_df = con.execute("DESCRIBE data").fetch_df() | |
| schema_text = schema_df.to_string(index=False) | |
| sql = ask_llm(question, schema_text) | |
| if not isinstance(sql, str): | |
| return f"Unexpected SQL response: {sql}" | |
| sql = sql.strip().strip(";") | |
| sql = re.sub(r"\bSTDEV\s*\(", "STDDEV(", sql, flags=re.IGNORECASE) | |
| result_df = con.execute(sql).fetch_df() | |
| except Exception as exc: | |
| return f"SQL error: {exc}\nSQL used:\n{locals().get('sql', 'N/A')}" | |
| finally: | |
| con.close() | |
| return explain_result(question, result_df) | |
| def format_data_preview(data, max_chars: int = 12000) -> str: | |
| """Return a trimmed, human-friendly preview to keep prompts under token limits.""" | |
| if data is None: | |
| return "No data loaded." | |
| try: | |
| if isinstance(data, pd.DataFrame): | |
| preview = data.head(20).to_csv(index=False) | |
| elif isinstance(data, list): | |
| chunks = [] | |
| for doc in data[:5]: | |
| text = getattr(doc, "page_content", str(doc)) | |
| if len(text) > 1500: | |
| text = text[:1500] + "...[truncated]" | |
| chunks.append(text) | |
| preview = "\n\n".join(chunks) | |
| else: | |
| preview = str(data) | |
| except Exception as exc: | |
| preview = f"Could not format data preview: {exc}" | |
| if len(preview) > max_chars: | |
| preview = preview[:max_chars] + "...[truncated]" | |
| return preview | |
| def summarize_csv(dataframe: pd.DataFrame) -> str: | |
| """Build a compact summary (top items, payment mix) from a CSV DataFrame.""" | |
| if dataframe is None or not isinstance(dataframe, pd.DataFrame): | |
| return "" | |
| summary_lines = [] | |
| quantity_col = next((c for c in dataframe.columns if c.lower().startswith("quantity")), None) | |
| desc_col = None | |
| for candidate in ("Description", "Product", "Item", "Product_Name"): | |
| if candidate in dataframe.columns: | |
| desc_col = candidate | |
| break | |
| payment_col = next((c for c in dataframe.columns if "payment" in c.lower()), None) | |
| if quantity_col and desc_col: | |
| try: | |
| top_items = ( | |
| dataframe.groupby(desc_col)[quantity_col] | |
| .sum() | |
| .sort_values(ascending=False) | |
| .head(10) | |
| ) | |
| summary_lines.append("Top items by quantity (sum):") | |
| summary_lines.append(top_items.to_string()) | |
| except Exception as exc: | |
| summary_lines.append(f"Could not compute top items: {exc}") | |
| if payment_col: | |
| try: | |
| payment_counts = dataframe[payment_col].value_counts().head(10) | |
| summary_lines.append("\nPayment method counts:") | |
| summary_lines.append(payment_counts.to_string()) | |
| except Exception as exc: | |
| summary_lines.append(f"Could not compute payment counts: {exc}") | |
| return "\n".join(summary_lines) | |
| def build_prompt(label: str, data, question: str, summary: str = "") -> str: | |
| preview = format_data_preview(data) | |
| summary_text = summary.strip() | |
| summary_block = f"\nData summary:\n{summary_text}\n" if summary_text else "" | |
| return f"""Do not reply with a python code. | |
| Data preview ({label}, truncated to avoid context limits): | |
| {preview} | |
| {summary_block} | |
| User question: {question} | |
| """ | |
| def pretty_print_columns(text): | |
| """ | |
| Beautifies the provided CSV column description text. | |
| Args: | |
| text (str): The input string containing the column descriptions. | |
| Returns: | |
| str: The beautified string with neatly formatted column descriptions. | |
| """ | |
| return " ".join([line.strip() for line in text.splitlines() if line.strip()]) | |
| radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)", | |
| "Upload my own CSV", | |
| "Upload my own PDF", | |
| f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})", | |
| "Enter my own URL"] | |
| # Visual polish | |
| st.markdown(""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap'); | |
| html, body, [class*="css"] { | |
| font-family: 'Space Grotesk', system-ui, -apple-system, sans-serif; | |
| } | |
| .stApp { | |
| background: radial-gradient(circle at 20% 20%, #e2f2ff 0, #f8fafc 50%, #ffffff 100%); | |
| color: #0f172a; | |
| } | |
| .block-container { | |
| padding-top: 1.5rem; | |
| padding-bottom: 3rem; | |
| padding-left: 2.5rem; | |
| padding-right: 2.5rem; | |
| } | |
| .hero { | |
| position: relative; | |
| overflow: hidden; | |
| padding: 1.5rem 1.8rem; | |
| border-radius: 18px; | |
| border: 1px solid #e5e7eb; | |
| background: linear-gradient(135deg, rgba(59,130,246,0.12), rgba(16,185,129,0.08)); | |
| box-shadow: 0 18px 45px rgba(15, 23, 42, 0.12); | |
| } | |
| .hero:before { | |
| content: ""; | |
| position: absolute; | |
| right: -120px; | |
| top: -80px; | |
| width: 260px; | |
| height: 260px; | |
| background: radial-gradient(circle, rgba(59,130,246,0.15), transparent 55%); | |
| filter: blur(6px); | |
| } | |
| .hero h1 { | |
| margin: 0.15rem 0 0.35rem 0; | |
| font-size: 2rem; | |
| line-height: 1.2; | |
| letter-spacing: -0.02em; | |
| color: #0f172a; | |
| } | |
| .hero p { | |
| color: #0f172a; | |
| opacity: 0.9; | |
| } | |
| .eyebrow { | |
| text-transform: uppercase; | |
| letter-spacing: 0.14em; | |
| font-size: 0.75rem; | |
| font-weight: 700; | |
| color: #0ea5e9; | |
| margin: 0; | |
| } | |
| .pill-row { | |
| display: flex; | |
| gap: 0.5rem; | |
| flex-wrap: wrap; | |
| margin-top: 0.85rem; | |
| } | |
| .pill { | |
| padding: 0.35rem 0.65rem; | |
| border-radius: 10px; | |
| background: rgba(15, 23, 42, 0.08); | |
| font-size: 0.85rem; | |
| font-weight: 600; | |
| } | |
| .section-label { | |
| font-size: 0.85rem; | |
| letter-spacing: 0.06em; | |
| text-transform: uppercase; | |
| color: #475569; | |
| margin-bottom: 0.2rem; | |
| font-weight: 700; | |
| } | |
| .section-card { | |
| background: #ffffff; | |
| border: 1px solid #e5e7eb; | |
| border-radius: 16px; | |
| padding: 1.1rem 1.2rem; | |
| box-shadow: 0 12px 32px rgba(15, 23, 42, 0.08); | |
| } | |
| .section-card.compact { | |
| padding: 0.9rem 1rem; | |
| margin-top: 0.5rem; | |
| } | |
| .prompt-chip { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 0.35rem; | |
| padding: 0.5rem 0.75rem; | |
| border-radius: 12px; | |
| background: #0ea5e911; | |
| border: 1px solid #bae6fd; | |
| color: #0f172a; | |
| font-weight: 600; | |
| } | |
| .status-pill { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 0.35rem; | |
| padding: 0.45rem 0.7rem; | |
| border-radius: 999px; | |
| border: 1px solid #e2e8f0; | |
| font-weight: 600; | |
| font-size: 0.9rem; | |
| } | |
| .status-pill.ready { | |
| background: #ecfeff; | |
| border-color: #a5f3fc; | |
| color: #0f172a; | |
| } | |
| .status-pill.idle { | |
| background: #f8fafc; | |
| border-color: #e2e8f0; | |
| color: #475569; | |
| } | |
| .stRadio div[role="radiogroup"] { | |
| display: grid; | |
| gap: 0.4rem; | |
| } | |
| .stRadio div[role="radio"] { | |
| border: 1px solid #e2e8f0; | |
| padding: 0.85rem 1rem; | |
| border-radius: 12px; | |
| background: #f8fafc; | |
| transition: all 0.18s ease-in-out; | |
| box-shadow: 0 8px 22px rgba(15, 23, 42, 0.05); | |
| } | |
| .stRadio div[role="radio"][aria-checked="true"] { | |
| border-color: #2563eb; | |
| background: #ffffff; | |
| box-shadow: 0 18px 40px rgba(37, 99, 235, 0.15); | |
| } | |
| .stRadio div[role="radio"]:hover { | |
| border-color: #3b82f6; | |
| transform: translateY(-1px); | |
| } | |
| .stTextInput>div>div>input { | |
| border-radius: 12px; | |
| border: 1px solid #e2e8f0; | |
| background: #ffffff; | |
| padding: 0.75rem 0.85rem; | |
| } | |
| .stTextInput>div>div>input:focus { | |
| border-color: #2563eb; | |
| box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.15); | |
| } | |
| .stFileUploader { | |
| border-radius: 14px; | |
| border: 1px dashed #cbd5e1; | |
| padding: 0.4rem 0.75rem 0.75rem 0.75rem; | |
| background: #f8fafc; | |
| } | |
| .stButton>button { | |
| background: linear-gradient(135deg, #2563eb, #0ea5e9); | |
| color: #ffffff; | |
| border: none; | |
| padding: 0.75rem 1.35rem; | |
| border-radius: 12px; | |
| font-weight: 700; | |
| letter-spacing: 0.02em; | |
| box-shadow: 0 12px 30px rgba(14, 165, 233, 0.28); | |
| transition: transform 0.12s ease, box-shadow 0.12s ease; | |
| } | |
| .stButton>button:hover { | |
| transform: translateY(-1px); | |
| box-shadow: 0 16px 38px rgba(37, 99, 235, 0.32); | |
| } | |
| .stButton>button:active { | |
| transform: translateY(0); | |
| } | |
| .stButton>button:disabled { | |
| background: #e2e8f0; | |
| color: #94a3b8; | |
| box-shadow: none; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| st.markdown(""" | |
| <div class="hero"> | |
| <p class="eyebrow">Document & Data Copilot</p> | |
| <h1>Chat with your PDFs, spreadsheets, or live web pages.</h1> | |
| <p>Upload or pick a preset, ask a focused question, and get a clear answer without digging through the source yourself.</p> | |
| <div class="pill-row"> | |
| <span class="pill">Summaries</span> | |
| <span class="pill">Follow-up questions</span> | |
| <span class="pill">Trends & metrics</span> | |
| <span class="pill">Plain-language insights</span> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| info_left, info_right = st.columns([1.05, 1]) | |
| with info_left: | |
| st.markdown("#### What you can do") | |
| st.markdown("- Skim long PDFs in a few bullet points\n- Ask for top performers or outliers in CSVs\n- Pull key quotes or facts from a URL\n- Iterate with follow-up questions to refine") | |
| with info_right: | |
| st.markdown("#### Quick tips") | |
| st.markdown("- Keep prompts short and specific\n- Mention the format you want (bullets, table, headline)\n- Ask one question at a time for best results\n- You can chain questions; context is remembered") | |
| st.markdown("### Choose a source to explore") | |
| genre = st.radio( | |
| "Pick the content you want to chat with", radioButtonList, index=0, key="source_radio" | |
| ) | |
| pdfCSVURLText = "" | |
| exampleQuestion = "" | |
| csv_data = None | |
| pdf_pages = None | |
| if genre==radioButtonList[1]: | |
| pdfCSVURLText = "CSV" | |
| exampleQuestion = "What are the data columns?" | |
| elif genre==radioButtonList[2]: | |
| pdfCSVURLText = "PDF" | |
| exampleQuestion = "Can you summarize the contents?" | |
| elif genre==radioButtonList[3]: | |
| pdfCSVURLText = "URL" | |
| exampleQuestion = "What is Google's latest earnings?" | |
| elif genre==radioButtonList[4]: | |
| pdfCSVURLText = "URL" | |
| exampleQuestion = "Can you summarize the contents?" | |
| else: # Default, E-commerce CSV | |
| pdfCSVURLText = "CSV" | |
| exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?" | |
| if os.path.exists(DEFAULT_ECOMMERCE_CSV): | |
| try: | |
| csv_data = pd.read_csv(DEFAULT_ECOMMERCE_CSV) | |
| except Exception as exc: | |
| st.warning(f"Problem loading {DEFAULT_ECOMMERCE_CSV} ({exc}). Falling back to a small sample dataset.") | |
| if csv_data is None: | |
| # Keep a tiny inline sample so the app still works even when the CSV is missing locally. | |
| csv_data = pd.DataFrame( | |
| [ | |
| {"InvoiceNo": "536365", "StockCode": "85123A", "Description": "White hanging heart", "Quantity": 6, "UnitPrice": 2.55, "Country": "United Kingdom"}, | |
| {"InvoiceNo": "536366", "StockCode": "71053", "Description": "White metal lantern", "Quantity": 6, "UnitPrice": 3.39, "Country": "United Kingdom"}, | |
| {"InvoiceNo": "536367", "StockCode": "84406B", "Description": "Pink mini hanging heart", "Quantity": 8, "UnitPrice": 1.65, "Country": "United Kingdom"}, | |
| ] | |
| ) | |
| st.info(f"{DEFAULT_ECOMMERCE_CSV} not found. Using an inline sample instead. Upload your own CSV if you need the full dataset.") | |
| st.markdown("### Add your data") | |
| st.caption("Upload a CSV/PDF or paste a URL. The built-in e-commerce sample is ready immediately.") | |
| if exampleQuestion: | |
| st.markdown( | |
| f""" | |
| <div class="section-card compact"> | |
| <div class="section-label">Suggested prompt</div> | |
| <div class="prompt-chip">{exampleQuestion}</div> | |
| </div> | |
| """, | |
| unsafe_allow_html=True, | |
| ) | |
| isCustomURL = genre==radioButtonList[4] | |
| urlInput = st.text_input('Enter your own URL', '', placeholder=f"Type your URL here (e.g. {defaultGoogleURL})", disabled=not isCustomURL) | |
| isCustomUpload = genre==radioButtonList[1] or genre==radioButtonList[2] | |
| uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomUpload) | |
| uploadedFilename = "" | |
| if uploaded_file is not None: | |
| if genre==radioButtonList[1]: # Custom CSV Upload | |
| try: | |
| csv_data = pd.read_csv(uploaded_file) | |
| except Exception as exc: | |
| st.error(f"Could not read uploaded CSV: {exc}") | |
| elif genre==radioButtonList[2]: # Custom PDF Upload | |
| with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}', delete=False) as f: | |
| f.write(uploaded_file.getbuffer()) | |
| uploadedFilename = f.name | |
| try: | |
| loader = PyPDFLoader(uploadedFilename) | |
| pdf_pages = loader.load_and_split() | |
| except Exception as exc: | |
| st.error(f"Could not read uploaded PDF: {exc}") | |
| finally: | |
| if uploadedFilename and os.path.exists(uploadedFilename): | |
| os.remove(uploadedFilename) | |
| enableChatBox = False | |
| if genre==radioButtonList[1]: # Custom CSV Upload | |
| enableChatBox = isinstance(csv_data, pd.DataFrame) | |
| elif genre==radioButtonList[2]: # Custom PDF Upload | |
| enableChatBox = pdf_pages is not None | |
| elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report | |
| enableChatBox = True | |
| elif genre==radioButtonList[4]: # Custom URL | |
| enableChatBox = True | |
| else: # E-commerce CSV | |
| enableChatBox = True | |
| status_class = "ready" if enableChatBox else "idle" | |
| status_text = "Ready to chat" if enableChatBox else "Load a file or URL to start" | |
| st.markdown(f'<div class="status-pill {status_class}">{status_text}</div>', unsafe_allow_html=True) | |
| st.markdown("### Ask a question") | |
| st.caption("Short, specific prompts work best. You can ask follow-ups without reloading.") | |
| chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox) | |
| chatWithPDFButton = "CLICK HERE TO START CHATTING" | |
| if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): # Button Cliked | |
| if genre==radioButtonList[0]: # E-commerce CSV | |
| st.write(run_duckdb_qa(chatTextStr, csv_data)) | |
| elif genre==radioButtonList[1]: # Custom CSV Upload | |
| st.write(run_duckdb_qa(chatTextStr, csv_data)) | |
| elif genre==radioButtonList[2]: # Custom PDF Upload | |
| content = build_prompt("Uploaded PDF", pdf_pages, chatTextStr) | |
| st.write(call_openrouter(content)) | |
| elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report | |
| loader = WebBaseLoader(defaultGoogleURL) | |
| web_data = loader.load() | |
| content = build_prompt("Google earnings URL", web_data, chatTextStr) | |
| st.write(call_openrouter(content)) | |
| elif genre==radioButtonList[4]: # Custom URL | |
| if not urlInput.strip(): | |
| st.warning("Please enter a URL first.") | |
| else: | |
| loader = WebBaseLoader(urlInput) | |
| web_data = loader.load() | |
| content = build_prompt("Custom URL", web_data, chatTextStr) | |
| st.write(call_openrouter(content)) | |