pdfchat / app.py
markytools's picture
beautified the UI
2f04502
import streamlit as st
from tempfile import NamedTemporaryFile
# Page-level look and feel
st.set_page_config(page_title="Document & Data Copilot", page_icon="💬", layout="wide")
import pprint
import re
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import WebBaseLoader
import duckdb
import pandas as pd
import numpy as np
import pprint
import requests
import json
defaultGoogleURL = "https://www.google.com/search?q=google+earnings"
OPEN_ROUTER_MODEL = "meta-llama/llama-3.3-70b-instruct:free"
DEFAULT_ECOMMERCE_CSV = "EcommerceDataset.csv"
# Input for OpenRouter API Key
OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"]
if not OPEN_ROUTER_KEY:
st.warning("Please enter your OpenRouter API Key to proceed.")
st.stop()
def call_openrouter(content: str) -> str:
"""Send a chat request to OpenRouter and return a safe string response."""
try:
response = requests.post(
url="https://openrouter.ai/api/v1/chat/completions",
headers={
"Authorization": f"Bearer {OPEN_ROUTER_KEY}",
"Content-Type": "application/json"
},
data=json.dumps({
"model": OPEN_ROUTER_MODEL,
"messages": [
{
"role": "user",
"content": content
}
]
}),
timeout=60,
)
except Exception as exc:
return f"Request error: {exc}"
if not response.ok:
# Return status code plus body so the user knows what went wrong.
return f"Request failed ({response.status_code}): {response.text}"
try:
data = response.json()
except Exception as exc:
return f"Invalid JSON response: {exc} | body: {response.text}"
try:
return data["choices"][0]["message"]["content"]
except Exception:
return f"Unexpected response format: {data}"
def call_openrouter_messages(messages) -> str:
"""Generic OpenRouter call that accepts a messages list."""
try:
response = requests.post(
url="https://openrouter.ai/api/v1/chat/completions",
headers={
"Authorization": f"Bearer {OPEN_ROUTER_KEY}",
"Content-Type": "application/json"
},
data=json.dumps({
"model": OPEN_ROUTER_MODEL,
"messages": messages
}),
timeout=60,
)
except Exception as exc:
return f"Request error: {exc}"
if not response.ok:
return f"Request failed ({response.status_code}): {response.text}"
try:
data = response.json()
except Exception as exc:
return f"Invalid JSON response: {exc} | body: {response.text}"
try:
return data["choices"][0]["message"]["content"]
except Exception:
return f"Unexpected response format: {data}"
def ask_llm(question: str, schema_text: str) -> str:
"""Ask the model to generate a DuckDB SQL query for the given question and schema."""
messages = [
{
"role": "system",
"content": f"""
You are a data analyst.
You MUST use ONLY this table:
- Table name: data
Schema:
{schema_text}
Rules:
- Use ONLY table name "data"
- Return ONE valid DuckDB SQL query
- Do NOT explain
- Do NOT use markdown
"""
},
{"role": "user", "content": question},
]
return call_openrouter_messages(messages)
def explain_result(question: str, df: pd.DataFrame) -> str:
"""Ask the model to explain the result set in plain language."""
try:
result_text = df.to_string(index=False)
except Exception:
result_text = str(df)
messages = [
{
"role": "system",
"content": """
You are a data analyst.
Given a user's question and a query result,
produce a concise, human-like explanation.
Rules:
- Do NOT mention SQL, databases, or tables
- Do NOT explain how the data was computed
- Be clear and business-friendly
"""
},
{
"role": "user",
"content": f"""
Question:
{question}
Query Result:
{result_text}
"""
},
]
return call_openrouter_messages(messages)
def sanitize_dataframe(df: pd.DataFrame):
"""Return a copy of df with column names sanitized for SQL identifiers."""
if df is None or not isinstance(df, pd.DataFrame):
return df, {}
rename_map = {}
used = set()
for col in df.columns:
new_col = re.sub(r"[^0-9a-zA-Z_]+", "_", str(col))
new_col = new_col.strip("_")
if re.match(r"^[0-9]", new_col):
new_col = f"col_{new_col}"
if not new_col:
new_col = "col"
base = new_col
idx = 1
while new_col in used:
new_col = f"{base}_{idx}"
idx += 1
used.add(new_col)
rename_map[col] = new_col
return df.rename(columns=rename_map), rename_map
def run_duckdb_qa(question: str, dataframe: pd.DataFrame) -> str:
"""Generate SQL via LLM, run it on DuckDB, and explain the result."""
if not question.strip():
return "Please enter a question."
if dataframe is None or not isinstance(dataframe, pd.DataFrame):
return "No CSV data loaded."
clean_df, rename_map = sanitize_dataframe(dataframe)
con = duckdb.connect()
try:
con.register("data", clean_df)
schema_df = con.execute("DESCRIBE data").fetch_df()
schema_text = schema_df.to_string(index=False)
sql = ask_llm(question, schema_text)
if not isinstance(sql, str):
return f"Unexpected SQL response: {sql}"
sql = sql.strip().strip(";")
sql = re.sub(r"\bSTDEV\s*\(", "STDDEV(", sql, flags=re.IGNORECASE)
result_df = con.execute(sql).fetch_df()
except Exception as exc:
return f"SQL error: {exc}\nSQL used:\n{locals().get('sql', 'N/A')}"
finally:
con.close()
return explain_result(question, result_df)
def format_data_preview(data, max_chars: int = 12000) -> str:
"""Return a trimmed, human-friendly preview to keep prompts under token limits."""
if data is None:
return "No data loaded."
try:
if isinstance(data, pd.DataFrame):
preview = data.head(20).to_csv(index=False)
elif isinstance(data, list):
chunks = []
for doc in data[:5]:
text = getattr(doc, "page_content", str(doc))
if len(text) > 1500:
text = text[:1500] + "...[truncated]"
chunks.append(text)
preview = "\n\n".join(chunks)
else:
preview = str(data)
except Exception as exc:
preview = f"Could not format data preview: {exc}"
if len(preview) > max_chars:
preview = preview[:max_chars] + "...[truncated]"
return preview
def summarize_csv(dataframe: pd.DataFrame) -> str:
"""Build a compact summary (top items, payment mix) from a CSV DataFrame."""
if dataframe is None or not isinstance(dataframe, pd.DataFrame):
return ""
summary_lines = []
quantity_col = next((c for c in dataframe.columns if c.lower().startswith("quantity")), None)
desc_col = None
for candidate in ("Description", "Product", "Item", "Product_Name"):
if candidate in dataframe.columns:
desc_col = candidate
break
payment_col = next((c for c in dataframe.columns if "payment" in c.lower()), None)
if quantity_col and desc_col:
try:
top_items = (
dataframe.groupby(desc_col)[quantity_col]
.sum()
.sort_values(ascending=False)
.head(10)
)
summary_lines.append("Top items by quantity (sum):")
summary_lines.append(top_items.to_string())
except Exception as exc:
summary_lines.append(f"Could not compute top items: {exc}")
if payment_col:
try:
payment_counts = dataframe[payment_col].value_counts().head(10)
summary_lines.append("\nPayment method counts:")
summary_lines.append(payment_counts.to_string())
except Exception as exc:
summary_lines.append(f"Could not compute payment counts: {exc}")
return "\n".join(summary_lines)
def build_prompt(label: str, data, question: str, summary: str = "") -> str:
preview = format_data_preview(data)
summary_text = summary.strip()
summary_block = f"\nData summary:\n{summary_text}\n" if summary_text else ""
return f"""Do not reply with a python code.
Data preview ({label}, truncated to avoid context limits):
{preview}
{summary_block}
User question: {question}
"""
def pretty_print_columns(text):
"""
Beautifies the provided CSV column description text.
Args:
text (str): The input string containing the column descriptions.
Returns:
str: The beautified string with neatly formatted column descriptions.
"""
return " ".join([line.strip() for line in text.splitlines() if line.strip()])
radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)",
"Upload my own CSV",
"Upload my own PDF",
f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})",
"Enter my own URL"]
# Visual polish
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap');
html, body, [class*="css"] {
font-family: 'Space Grotesk', system-ui, -apple-system, sans-serif;
}
.stApp {
background: radial-gradient(circle at 20% 20%, #e2f2ff 0, #f8fafc 50%, #ffffff 100%);
color: #0f172a;
}
.block-container {
padding-top: 1.5rem;
padding-bottom: 3rem;
padding-left: 2.5rem;
padding-right: 2.5rem;
}
.hero {
position: relative;
overflow: hidden;
padding: 1.5rem 1.8rem;
border-radius: 18px;
border: 1px solid #e5e7eb;
background: linear-gradient(135deg, rgba(59,130,246,0.12), rgba(16,185,129,0.08));
box-shadow: 0 18px 45px rgba(15, 23, 42, 0.12);
}
.hero:before {
content: "";
position: absolute;
right: -120px;
top: -80px;
width: 260px;
height: 260px;
background: radial-gradient(circle, rgba(59,130,246,0.15), transparent 55%);
filter: blur(6px);
}
.hero h1 {
margin: 0.15rem 0 0.35rem 0;
font-size: 2rem;
line-height: 1.2;
letter-spacing: -0.02em;
color: #0f172a;
}
.hero p {
color: #0f172a;
opacity: 0.9;
}
.eyebrow {
text-transform: uppercase;
letter-spacing: 0.14em;
font-size: 0.75rem;
font-weight: 700;
color: #0ea5e9;
margin: 0;
}
.pill-row {
display: flex;
gap: 0.5rem;
flex-wrap: wrap;
margin-top: 0.85rem;
}
.pill {
padding: 0.35rem 0.65rem;
border-radius: 10px;
background: rgba(15, 23, 42, 0.08);
font-size: 0.85rem;
font-weight: 600;
}
.section-label {
font-size: 0.85rem;
letter-spacing: 0.06em;
text-transform: uppercase;
color: #475569;
margin-bottom: 0.2rem;
font-weight: 700;
}
.section-card {
background: #ffffff;
border: 1px solid #e5e7eb;
border-radius: 16px;
padding: 1.1rem 1.2rem;
box-shadow: 0 12px 32px rgba(15, 23, 42, 0.08);
}
.section-card.compact {
padding: 0.9rem 1rem;
margin-top: 0.5rem;
}
.prompt-chip {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.5rem 0.75rem;
border-radius: 12px;
background: #0ea5e911;
border: 1px solid #bae6fd;
color: #0f172a;
font-weight: 600;
}
.status-pill {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.45rem 0.7rem;
border-radius: 999px;
border: 1px solid #e2e8f0;
font-weight: 600;
font-size: 0.9rem;
}
.status-pill.ready {
background: #ecfeff;
border-color: #a5f3fc;
color: #0f172a;
}
.status-pill.idle {
background: #f8fafc;
border-color: #e2e8f0;
color: #475569;
}
.stRadio div[role="radiogroup"] {
display: grid;
gap: 0.4rem;
}
.stRadio div[role="radio"] {
border: 1px solid #e2e8f0;
padding: 0.85rem 1rem;
border-radius: 12px;
background: #f8fafc;
transition: all 0.18s ease-in-out;
box-shadow: 0 8px 22px rgba(15, 23, 42, 0.05);
}
.stRadio div[role="radio"][aria-checked="true"] {
border-color: #2563eb;
background: #ffffff;
box-shadow: 0 18px 40px rgba(37, 99, 235, 0.15);
}
.stRadio div[role="radio"]:hover {
border-color: #3b82f6;
transform: translateY(-1px);
}
.stTextInput>div>div>input {
border-radius: 12px;
border: 1px solid #e2e8f0;
background: #ffffff;
padding: 0.75rem 0.85rem;
}
.stTextInput>div>div>input:focus {
border-color: #2563eb;
box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.15);
}
.stFileUploader {
border-radius: 14px;
border: 1px dashed #cbd5e1;
padding: 0.4rem 0.75rem 0.75rem 0.75rem;
background: #f8fafc;
}
.stButton>button {
background: linear-gradient(135deg, #2563eb, #0ea5e9);
color: #ffffff;
border: none;
padding: 0.75rem 1.35rem;
border-radius: 12px;
font-weight: 700;
letter-spacing: 0.02em;
box-shadow: 0 12px 30px rgba(14, 165, 233, 0.28);
transition: transform 0.12s ease, box-shadow 0.12s ease;
}
.stButton>button:hover {
transform: translateY(-1px);
box-shadow: 0 16px 38px rgba(37, 99, 235, 0.32);
}
.stButton>button:active {
transform: translateY(0);
}
.stButton>button:disabled {
background: #e2e8f0;
color: #94a3b8;
box-shadow: none;
}
</style>
""", unsafe_allow_html=True)
st.markdown("""
<div class="hero">
<p class="eyebrow">Document & Data Copilot</p>
<h1>Chat with your PDFs, spreadsheets, or live web pages.</h1>
<p>Upload or pick a preset, ask a focused question, and get a clear answer without digging through the source yourself.</p>
<div class="pill-row">
<span class="pill">Summaries</span>
<span class="pill">Follow-up questions</span>
<span class="pill">Trends & metrics</span>
<span class="pill">Plain-language insights</span>
</div>
</div>
""", unsafe_allow_html=True)
info_left, info_right = st.columns([1.05, 1])
with info_left:
st.markdown("#### What you can do")
st.markdown("- Skim long PDFs in a few bullet points\n- Ask for top performers or outliers in CSVs\n- Pull key quotes or facts from a URL\n- Iterate with follow-up questions to refine")
with info_right:
st.markdown("#### Quick tips")
st.markdown("- Keep prompts short and specific\n- Mention the format you want (bullets, table, headline)\n- Ask one question at a time for best results\n- You can chain questions; context is remembered")
st.markdown("### Choose a source to explore")
genre = st.radio(
"Pick the content you want to chat with", radioButtonList, index=0, key="source_radio"
)
pdfCSVURLText = ""
exampleQuestion = ""
csv_data = None
pdf_pages = None
if genre==radioButtonList[1]:
pdfCSVURLText = "CSV"
exampleQuestion = "What are the data columns?"
elif genre==radioButtonList[2]:
pdfCSVURLText = "PDF"
exampleQuestion = "Can you summarize the contents?"
elif genre==radioButtonList[3]:
pdfCSVURLText = "URL"
exampleQuestion = "What is Google's latest earnings?"
elif genre==radioButtonList[4]:
pdfCSVURLText = "URL"
exampleQuestion = "Can you summarize the contents?"
else: # Default, E-commerce CSV
pdfCSVURLText = "CSV"
exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?"
if os.path.exists(DEFAULT_ECOMMERCE_CSV):
try:
csv_data = pd.read_csv(DEFAULT_ECOMMERCE_CSV)
except Exception as exc:
st.warning(f"Problem loading {DEFAULT_ECOMMERCE_CSV} ({exc}). Falling back to a small sample dataset.")
if csv_data is None:
# Keep a tiny inline sample so the app still works even when the CSV is missing locally.
csv_data = pd.DataFrame(
[
{"InvoiceNo": "536365", "StockCode": "85123A", "Description": "White hanging heart", "Quantity": 6, "UnitPrice": 2.55, "Country": "United Kingdom"},
{"InvoiceNo": "536366", "StockCode": "71053", "Description": "White metal lantern", "Quantity": 6, "UnitPrice": 3.39, "Country": "United Kingdom"},
{"InvoiceNo": "536367", "StockCode": "84406B", "Description": "Pink mini hanging heart", "Quantity": 8, "UnitPrice": 1.65, "Country": "United Kingdom"},
]
)
st.info(f"{DEFAULT_ECOMMERCE_CSV} not found. Using an inline sample instead. Upload your own CSV if you need the full dataset.")
st.markdown("### Add your data")
st.caption("Upload a CSV/PDF or paste a URL. The built-in e-commerce sample is ready immediately.")
if exampleQuestion:
st.markdown(
f"""
<div class="section-card compact">
<div class="section-label">Suggested prompt</div>
<div class="prompt-chip">{exampleQuestion}</div>
</div>
""",
unsafe_allow_html=True,
)
isCustomURL = genre==radioButtonList[4]
urlInput = st.text_input('Enter your own URL', '', placeholder=f"Type your URL here (e.g. {defaultGoogleURL})", disabled=not isCustomURL)
isCustomUpload = genre==radioButtonList[1] or genre==radioButtonList[2]
uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomUpload)
uploadedFilename = ""
if uploaded_file is not None:
if genre==radioButtonList[1]: # Custom CSV Upload
try:
csv_data = pd.read_csv(uploaded_file)
except Exception as exc:
st.error(f"Could not read uploaded CSV: {exc}")
elif genre==radioButtonList[2]: # Custom PDF Upload
with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}', delete=False) as f:
f.write(uploaded_file.getbuffer())
uploadedFilename = f.name
try:
loader = PyPDFLoader(uploadedFilename)
pdf_pages = loader.load_and_split()
except Exception as exc:
st.error(f"Could not read uploaded PDF: {exc}")
finally:
if uploadedFilename and os.path.exists(uploadedFilename):
os.remove(uploadedFilename)
enableChatBox = False
if genre==radioButtonList[1]: # Custom CSV Upload
enableChatBox = isinstance(csv_data, pd.DataFrame)
elif genre==radioButtonList[2]: # Custom PDF Upload
enableChatBox = pdf_pages is not None
elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
enableChatBox = True
elif genre==radioButtonList[4]: # Custom URL
enableChatBox = True
else: # E-commerce CSV
enableChatBox = True
status_class = "ready" if enableChatBox else "idle"
status_text = "Ready to chat" if enableChatBox else "Load a file or URL to start"
st.markdown(f'<div class="status-pill {status_class}">{status_text}</div>', unsafe_allow_html=True)
st.markdown("### Ask a question")
st.caption("Short, specific prompts work best. You can ask follow-ups without reloading.")
chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox)
chatWithPDFButton = "CLICK HERE TO START CHATTING"
if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): # Button Cliked
if genre==radioButtonList[0]: # E-commerce CSV
st.write(run_duckdb_qa(chatTextStr, csv_data))
elif genre==radioButtonList[1]: # Custom CSV Upload
st.write(run_duckdb_qa(chatTextStr, csv_data))
elif genre==radioButtonList[2]: # Custom PDF Upload
content = build_prompt("Uploaded PDF", pdf_pages, chatTextStr)
st.write(call_openrouter(content))
elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
loader = WebBaseLoader(defaultGoogleURL)
web_data = loader.load()
content = build_prompt("Google earnings URL", web_data, chatTextStr)
st.write(call_openrouter(content))
elif genre==radioButtonList[4]: # Custom URL
if not urlInput.strip():
st.warning("Please enter a URL first.")
else:
loader = WebBaseLoader(urlInput)
web_data = loader.load()
content = build_prompt("Custom URL", web_data, chatTextStr)
st.write(call_openrouter(content))