Spaces:

markytools
/

pdfchat

Sleeping

App Files Files Community

pdfchat / app.py

markytools

beautified the UI

2f04502 about 1 month ago

raw

history blame contribute delete

21 kB

	import streamlit as st
	from tempfile import NamedTemporaryFile

	# Page-level look and feel
	st.set_page_config(page_title="Document & Data Copilot", page_icon="💬", layout="wide")

	import pprint
	import re
	import os
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.document_loaders.csv_loader import CSVLoader
	from langchain.document_loaders import WebBaseLoader
	import duckdb

	import pandas as pd
	import numpy as np
	import pprint
	import requests
	import json

	defaultGoogleURL = "https://www.google.com/search?q=google+earnings"
	OPEN_ROUTER_MODEL = "meta-llama/llama-3.3-70b-instruct:free"
	DEFAULT_ECOMMERCE_CSV = "EcommerceDataset.csv"

	# Input for OpenRouter API Key
	OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"]

	if not OPEN_ROUTER_KEY:
	st.warning("Please enter your OpenRouter API Key to proceed.")
	st.stop()

	def call_openrouter(content: str) -> str:
	"""Send a chat request to OpenRouter and return a safe string response."""
	try:
	response = requests.post(
	url="https://openrouter.ai/api/v1/chat/completions",
	headers={
	"Authorization": f"Bearer {OPEN_ROUTER_KEY}",
	"Content-Type": "application/json"
	},
	data=json.dumps({
	"model": OPEN_ROUTER_MODEL,
	"messages": [
	{
	"role": "user",
	"content": content
	}
	]
	}),
	timeout=60,
	)
	except Exception as exc:
	return f"Request error: {exc}"

	if not response.ok:
	# Return status code plus body so the user knows what went wrong.
	return f"Request failed ({response.status_code}): {response.text}"

	try:
	data = response.json()
	except Exception as exc:
	return f"Invalid JSON response: {exc} \| body: {response.text}"

	try:
	return data["choices"][0]["message"]["content"]
	except Exception:
	return f"Unexpected response format: {data}"

	def call_openrouter_messages(messages) -> str:
	"""Generic OpenRouter call that accepts a messages list."""
	try:
	response = requests.post(
	url="https://openrouter.ai/api/v1/chat/completions",
	headers={
	"Authorization": f"Bearer {OPEN_ROUTER_KEY}",
	"Content-Type": "application/json"
	},
	data=json.dumps({
	"model": OPEN_ROUTER_MODEL,
	"messages": messages
	}),
	timeout=60,
	)
	except Exception as exc:
	return f"Request error: {exc}"

	if not response.ok:
	return f"Request failed ({response.status_code}): {response.text}"

	try:
	data = response.json()
	except Exception as exc:
	return f"Invalid JSON response: {exc} \| body: {response.text}"

	try:
	return data["choices"][0]["message"]["content"]
	except Exception:
	return f"Unexpected response format: {data}"

	def ask_llm(question: str, schema_text: str) -> str:
	"""Ask the model to generate a DuckDB SQL query for the given question and schema."""
	messages = [
	{
	"role": "system",
	"content": f"""
	You are a data analyst.

	You MUST use ONLY this table:
	- Table name: data

	Schema:
	{schema_text}

	Rules:
	- Use ONLY table name "data"
	- Return ONE valid DuckDB SQL query
	- Do NOT explain
	- Do NOT use markdown
	"""
	},
	{"role": "user", "content": question},
	]
	return call_openrouter_messages(messages)

	def explain_result(question: str, df: pd.DataFrame) -> str:
	"""Ask the model to explain the result set in plain language."""
	try:
	result_text = df.to_string(index=False)
	except Exception:
	result_text = str(df)

	messages = [
	{
	"role": "system",
	"content": """
	You are a data analyst.

	Given a user's question and a query result,
	produce a concise, human-like explanation.

	Rules:
	- Do NOT mention SQL, databases, or tables
	- Do NOT explain how the data was computed
	- Be clear and business-friendly
	"""
	},
	{
	"role": "user",
	"content": f"""
	Question:
	{question}

	Query Result:
	{result_text}
	"""
	},
	]
	return call_openrouter_messages(messages)

	def sanitize_dataframe(df: pd.DataFrame):
	"""Return a copy of df with column names sanitized for SQL identifiers."""
	if df is None or not isinstance(df, pd.DataFrame):
	return df, {}
	rename_map = {}
	used = set()
	for col in df.columns:
	new_col = re.sub(r"[^0-9a-zA-Z_]+", "_", str(col))
	new_col = new_col.strip("_")
	if re.match(r"^[0-9]", new_col):
	new_col = f"col_{new_col}"
	if not new_col:
	new_col = "col"
	base = new_col
	idx = 1
	while new_col in used:
	new_col = f"{base}_{idx}"
	idx += 1
	used.add(new_col)
	rename_map[col] = new_col
	return df.rename(columns=rename_map), rename_map

	def run_duckdb_qa(question: str, dataframe: pd.DataFrame) -> str:
	"""Generate SQL via LLM, run it on DuckDB, and explain the result."""
	if not question.strip():
	return "Please enter a question."
	if dataframe is None or not isinstance(dataframe, pd.DataFrame):
	return "No CSV data loaded."
	clean_df, rename_map = sanitize_dataframe(dataframe)
	con = duckdb.connect()
	try:
	con.register("data", clean_df)
	schema_df = con.execute("DESCRIBE data").fetch_df()
	schema_text = schema_df.to_string(index=False)
	sql = ask_llm(question, schema_text)
	if not isinstance(sql, str):
	return f"Unexpected SQL response: {sql}"
	sql = sql.strip().strip(";")
	sql = re.sub(r"\bSTDEV\s*\(", "STDDEV(", sql, flags=re.IGNORECASE)
	result_df = con.execute(sql).fetch_df()
	except Exception as exc:
	return f"SQL error: {exc}\nSQL used:\n{locals().get('sql', 'N/A')}"
	finally:
	con.close()

	return explain_result(question, result_df)

	def format_data_preview(data, max_chars: int = 12000) -> str:
	"""Return a trimmed, human-friendly preview to keep prompts under token limits."""
	if data is None:
	return "No data loaded."
	try:
	if isinstance(data, pd.DataFrame):
	preview = data.head(20).to_csv(index=False)
	elif isinstance(data, list):
	chunks = []
	for doc in data[:5]:
	text = getattr(doc, "page_content", str(doc))
	if len(text) > 1500:
	text = text[:1500] + "...[truncated]"
	chunks.append(text)
	preview = "\n\n".join(chunks)
	else:
	preview = str(data)
	except Exception as exc:
	preview = f"Could not format data preview: {exc}"
	if len(preview) > max_chars:
	preview = preview[:max_chars] + "...[truncated]"
	return preview

	def summarize_csv(dataframe: pd.DataFrame) -> str:
	"""Build a compact summary (top items, payment mix) from a CSV DataFrame."""
	if dataframe is None or not isinstance(dataframe, pd.DataFrame):
	return ""
	summary_lines = []

	quantity_col = next((c for c in dataframe.columns if c.lower().startswith("quantity")), None)
	desc_col = None
	for candidate in ("Description", "Product", "Item", "Product_Name"):
	if candidate in dataframe.columns:
	desc_col = candidate
	break
	payment_col = next((c for c in dataframe.columns if "payment" in c.lower()), None)

	if quantity_col and desc_col:
	try:
	top_items = (
	dataframe.groupby(desc_col)[quantity_col]
	.sum()
	.sort_values(ascending=False)
	.head(10)
	)
	summary_lines.append("Top items by quantity (sum):")
	summary_lines.append(top_items.to_string())
	except Exception as exc:
	summary_lines.append(f"Could not compute top items: {exc}")

	if payment_col:
	try:
	payment_counts = dataframe[payment_col].value_counts().head(10)
	summary_lines.append("\nPayment method counts:")
	summary_lines.append(payment_counts.to_string())
	except Exception as exc:
	summary_lines.append(f"Could not compute payment counts: {exc}")

	return "\n".join(summary_lines)

	def build_prompt(label: str, data, question: str, summary: str = "") -> str:
	preview = format_data_preview(data)
	summary_text = summary.strip()
	summary_block = f"\nData summary:\n{summary_text}\n" if summary_text else ""
	return f"""Do not reply with a python code.
	Data preview ({label}, truncated to avoid context limits):

	{preview}

	{summary_block}
	User question: {question}
	"""

	def pretty_print_columns(text):
	"""
	Beautifies the provided CSV column description text.

	Args:
	text (str): The input string containing the column descriptions.

	Returns:
	str: The beautified string with neatly formatted column descriptions.
	"""
	return " ".join([line.strip() for line in text.splitlines() if line.strip()])

	radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)",
	"Upload my own CSV",
	"Upload my own PDF",
	f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})",
	"Enter my own URL"]

	# Visual polish
	st.markdown("""
	<style>
	@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap');

	html, body, [class*="css"] {
	font-family: 'Space Grotesk', system-ui, -apple-system, sans-serif;
	}

	.stApp {
	background: radial-gradient(circle at 20% 20%, #e2f2ff 0, #f8fafc 50%, #ffffff 100%);
	color: #0f172a;
	}

	.block-container {
	padding-top: 1.5rem;
	padding-bottom: 3rem;
	padding-left: 2.5rem;
	padding-right: 2.5rem;
	}

	.hero {
	position: relative;
	overflow: hidden;
	padding: 1.5rem 1.8rem;
	border-radius: 18px;
	border: 1px solid #e5e7eb;
	background: linear-gradient(135deg, rgba(59,130,246,0.12), rgba(16,185,129,0.08));
	box-shadow: 0 18px 45px rgba(15, 23, 42, 0.12);
	}

	.hero:before {
	content: "";
	position: absolute;
	right: -120px;
	top: -80px;
	width: 260px;
	height: 260px;
	background: radial-gradient(circle, rgba(59,130,246,0.15), transparent 55%);
	filter: blur(6px);
	}

	.hero h1 {
	margin: 0.15rem 0 0.35rem 0;
	font-size: 2rem;
	line-height: 1.2;
	letter-spacing: -0.02em;
	color: #0f172a;
	}

	.hero p {
	color: #0f172a;
	opacity: 0.9;
	}

	.eyebrow {
	text-transform: uppercase;
	letter-spacing: 0.14em;
	font-size: 0.75rem;
	font-weight: 700;
	color: #0ea5e9;
	margin: 0;
	}

	.pill-row {
	display: flex;
	gap: 0.5rem;
	flex-wrap: wrap;
	margin-top: 0.85rem;
	}

	.pill {
	padding: 0.35rem 0.65rem;
	border-radius: 10px;
	background: rgba(15, 23, 42, 0.08);
	font-size: 0.85rem;
	font-weight: 600;
	}

	.section-label {
	font-size: 0.85rem;
	letter-spacing: 0.06em;
	text-transform: uppercase;
	color: #475569;
	margin-bottom: 0.2rem;
	font-weight: 700;
	}

	.section-card {
	background: #ffffff;
	border: 1px solid #e5e7eb;
	border-radius: 16px;
	padding: 1.1rem 1.2rem;
	box-shadow: 0 12px 32px rgba(15, 23, 42, 0.08);
	}

	.section-card.compact {
	padding: 0.9rem 1rem;
	margin-top: 0.5rem;
	}

	.prompt-chip {
	display: inline-flex;
	align-items: center;
	gap: 0.35rem;
	padding: 0.5rem 0.75rem;
	border-radius: 12px;
	background: #0ea5e911;
	border: 1px solid #bae6fd;
	color: #0f172a;
	font-weight: 600;
	}

	.status-pill {
	display: inline-flex;
	align-items: center;
	gap: 0.35rem;
	padding: 0.45rem 0.7rem;
	border-radius: 999px;
	border: 1px solid #e2e8f0;
	font-weight: 600;
	font-size: 0.9rem;
	}

	.status-pill.ready {
	background: #ecfeff;
	border-color: #a5f3fc;
	color: #0f172a;
	}

	.status-pill.idle {
	background: #f8fafc;
	border-color: #e2e8f0;
	color: #475569;
	}

	.stRadio div[role="radiogroup"] {
	display: grid;
	gap: 0.4rem;
	}

	.stRadio div[role="radio"] {
	border: 1px solid #e2e8f0;
	padding: 0.85rem 1rem;
	border-radius: 12px;
	background: #f8fafc;
	transition: all 0.18s ease-in-out;
	box-shadow: 0 8px 22px rgba(15, 23, 42, 0.05);
	}

	.stRadio div[role="radio"][aria-checked="true"] {
	border-color: #2563eb;
	background: #ffffff;
	box-shadow: 0 18px 40px rgba(37, 99, 235, 0.15);
	}

	.stRadio div[role="radio"]:hover {
	border-color: #3b82f6;
	transform: translateY(-1px);
	}

	.stTextInput>div>div>input {
	border-radius: 12px;
	border: 1px solid #e2e8f0;
	background: #ffffff;
	padding: 0.75rem 0.85rem;
	}

	.stTextInput>div>div>input:focus {
	border-color: #2563eb;
	box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.15);
	}

	.stFileUploader {
	border-radius: 14px;
	border: 1px dashed #cbd5e1;
	padding: 0.4rem 0.75rem 0.75rem 0.75rem;
	background: #f8fafc;
	}

	.stButton>button {
	background: linear-gradient(135deg, #2563eb, #0ea5e9);
	color: #ffffff;
	border: none;
	padding: 0.75rem 1.35rem;
	border-radius: 12px;
	font-weight: 700;
	letter-spacing: 0.02em;
	box-shadow: 0 12px 30px rgba(14, 165, 233, 0.28);
	transition: transform 0.12s ease, box-shadow 0.12s ease;
	}

	.stButton>button:hover {
	transform: translateY(-1px);
	box-shadow: 0 16px 38px rgba(37, 99, 235, 0.32);
	}

	.stButton>button:active {
	transform: translateY(0);
	}

	.stButton>button:disabled {
	background: #e2e8f0;
	color: #94a3b8;
	box-shadow: none;
	}
	</style>
	""", unsafe_allow_html=True)

	st.markdown("""
	<div class="hero">
	<p class="eyebrow">Document & Data Copilot</p>
	<h1>Chat with your PDFs, spreadsheets, or live web pages.</h1>
	<p>Upload or pick a preset, ask a focused question, and get a clear answer without digging through the source yourself.</p>
	<div class="pill-row">
	<span class="pill">Summaries</span>
	<span class="pill">Follow-up questions</span>
	<span class="pill">Trends & metrics</span>
	<span class="pill">Plain-language insights</span>
	</div>
	</div>
	""", unsafe_allow_html=True)

	info_left, info_right = st.columns([1.05, 1])
	with info_left:
	st.markdown("#### What you can do")
	st.markdown("- Skim long PDFs in a few bullet points\n- Ask for top performers or outliers in CSVs\n- Pull key quotes or facts from a URL\n- Iterate with follow-up questions to refine")

	with info_right:
	st.markdown("#### Quick tips")
	st.markdown("- Keep prompts short and specific\n- Mention the format you want (bullets, table, headline)\n- Ask one question at a time for best results\n- You can chain questions; context is remembered")

	st.markdown("### Choose a source to explore")
	genre = st.radio(
	"Pick the content you want to chat with", radioButtonList, index=0, key="source_radio"
	)

	pdfCSVURLText = ""
	exampleQuestion = ""
	csv_data = None
	pdf_pages = None

	if genre==radioButtonList[1]:
	pdfCSVURLText = "CSV"
	exampleQuestion = "What are the data columns?"
	elif genre==radioButtonList[2]:
	pdfCSVURLText = "PDF"
	exampleQuestion = "Can you summarize the contents?"
	elif genre==radioButtonList[3]:
	pdfCSVURLText = "URL"
	exampleQuestion = "What is Google's latest earnings?"
	elif genre==radioButtonList[4]:
	pdfCSVURLText = "URL"
	exampleQuestion = "Can you summarize the contents?"
	else: # Default, E-commerce CSV
	pdfCSVURLText = "CSV"
	exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?"
	if os.path.exists(DEFAULT_ECOMMERCE_CSV):
	try:
	csv_data = pd.read_csv(DEFAULT_ECOMMERCE_CSV)
	except Exception as exc:
	st.warning(f"Problem loading {DEFAULT_ECOMMERCE_CSV} ({exc}). Falling back to a small sample dataset.")
	if csv_data is None:
	# Keep a tiny inline sample so the app still works even when the CSV is missing locally.
	csv_data = pd.DataFrame(
	[
	{"InvoiceNo": "536365", "StockCode": "85123A", "Description": "White hanging heart", "Quantity": 6, "UnitPrice": 2.55, "Country": "United Kingdom"},
	{"InvoiceNo": "536366", "StockCode": "71053", "Description": "White metal lantern", "Quantity": 6, "UnitPrice": 3.39, "Country": "United Kingdom"},
	{"InvoiceNo": "536367", "StockCode": "84406B", "Description": "Pink mini hanging heart", "Quantity": 8, "UnitPrice": 1.65, "Country": "United Kingdom"},
	]
	)
	st.info(f"{DEFAULT_ECOMMERCE_CSV} not found. Using an inline sample instead. Upload your own CSV if you need the full dataset.")

	st.markdown("### Add your data")
	st.caption("Upload a CSV/PDF or paste a URL. The built-in e-commerce sample is ready immediately.")
	if exampleQuestion:
	st.markdown(
	f"""
	<div class="section-card compact">
	<div class="section-label">Suggested prompt</div>
	<div class="prompt-chip">{exampleQuestion}</div>
	</div>
	""",
	unsafe_allow_html=True,
	)

	isCustomURL = genre==radioButtonList[4]
	urlInput = st.text_input('Enter your own URL', '', placeholder=f"Type your URL here (e.g. {defaultGoogleURL})", disabled=not isCustomURL)

	isCustomUpload = genre==radioButtonList[1] or genre==radioButtonList[2]
	uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomUpload)
	uploadedFilename = ""
	if uploaded_file is not None:
	if genre==radioButtonList[1]: # Custom CSV Upload
	try:
	csv_data = pd.read_csv(uploaded_file)
	except Exception as exc:
	st.error(f"Could not read uploaded CSV: {exc}")
	elif genre==radioButtonList[2]: # Custom PDF Upload
	with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}', delete=False) as f:
	f.write(uploaded_file.getbuffer())
	uploadedFilename = f.name
	try:
	loader = PyPDFLoader(uploadedFilename)
	pdf_pages = loader.load_and_split()
	except Exception as exc:
	st.error(f"Could not read uploaded PDF: {exc}")
	finally:
	if uploadedFilename and os.path.exists(uploadedFilename):
	os.remove(uploadedFilename)

	enableChatBox = False
	if genre==radioButtonList[1]: # Custom CSV Upload
	enableChatBox = isinstance(csv_data, pd.DataFrame)
	elif genre==radioButtonList[2]: # Custom PDF Upload
	enableChatBox = pdf_pages is not None
	elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
	enableChatBox = True
	elif genre==radioButtonList[4]: # Custom URL
	enableChatBox = True
	else: # E-commerce CSV
	enableChatBox = True

	status_class = "ready" if enableChatBox else "idle"
	status_text = "Ready to chat" if enableChatBox else "Load a file or URL to start"
	st.markdown(f'<div class="status-pill {status_class}">{status_text}</div>', unsafe_allow_html=True)

	st.markdown("### Ask a question")
	st.caption("Short, specific prompts work best. You can ask follow-ups without reloading.")
	chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox)
	chatWithPDFButton = "CLICK HERE TO START CHATTING"
	if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): # Button Cliked
	if genre==radioButtonList[0]: # E-commerce CSV
	st.write(run_duckdb_qa(chatTextStr, csv_data))

	elif genre==radioButtonList[1]: # Custom CSV Upload
	st.write(run_duckdb_qa(chatTextStr, csv_data))

	elif genre==radioButtonList[2]: # Custom PDF Upload
	content = build_prompt("Uploaded PDF", pdf_pages, chatTextStr)
	st.write(call_openrouter(content))
	elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
	loader = WebBaseLoader(defaultGoogleURL)
	web_data = loader.load()
	content = build_prompt("Google earnings URL", web_data, chatTextStr)
	st.write(call_openrouter(content))
	elif genre==radioButtonList[4]: # Custom URL
	if not urlInput.strip():
	st.warning("Please enter a URL first.")
	else:
	loader = WebBaseLoader(urlInput)
	web_data = loader.load()
	content = build_prompt("Custom URL", web_data, chatTextStr)
	st.write(call_openrouter(content))