Spaces:

markytools
/

pdfchat

Sleeping

File size: 20,975 Bytes

import streamlit as st
from tempfile import NamedTemporaryFile

# Page-level look and feel
st.set_page_config(page_title="Document & Data Copilot", page_icon="💬", layout="wide")

import pprint
import re
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import WebBaseLoader
import duckdb

import pandas as pd
import numpy as np
import pprint
import requests
import json

defaultGoogleURL = "https://www.google.com/search?q=google+earnings"
OPEN_ROUTER_MODEL = "meta-llama/llama-3.3-70b-instruct:free"
DEFAULT_ECOMMERCE_CSV = "EcommerceDataset.csv"

# Input for OpenRouter API Key
OPEN_ROUTER_KEY = st.secrets["OPEN_ROUTER_KEY"]

if not OPEN_ROUTER_KEY:
    st.warning("Please enter your OpenRouter API Key to proceed.")
    st.stop()

def call_openrouter(content: str) -> str:
    """Send a chat request to OpenRouter and return a safe string response."""
    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPEN_ROUTER_KEY}",
                "Content-Type": "application/json"
            },
            data=json.dumps({
                "model": OPEN_ROUTER_MODEL,
                "messages": [
                    {
                        "role": "user",
                        "content": content
                    }
                ]
            }),
            timeout=60,
        )
    except Exception as exc:
        return f"Request error: {exc}"

    if not response.ok:
        # Return status code plus body so the user knows what went wrong.
        return f"Request failed ({response.status_code}): {response.text}"

    try:
        data = response.json()
    except Exception as exc:
        return f"Invalid JSON response: {exc} | body: {response.text}"

    try:
        return data["choices"][0]["message"]["content"]
    except Exception:
        return f"Unexpected response format: {data}"

def call_openrouter_messages(messages) -> str:
    """Generic OpenRouter call that accepts a messages list."""
    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPEN_ROUTER_KEY}",
                "Content-Type": "application/json"
            },
            data=json.dumps({
                "model": OPEN_ROUTER_MODEL,
                "messages": messages
            }),
            timeout=60,
        )
    except Exception as exc:
        return f"Request error: {exc}"

    if not response.ok:
        return f"Request failed ({response.status_code}): {response.text}"

    try:
        data = response.json()
    except Exception as exc:
        return f"Invalid JSON response: {exc} | body: {response.text}"

    try:
        return data["choices"][0]["message"]["content"]
    except Exception:
        return f"Unexpected response format: {data}"

def ask_llm(question: str, schema_text: str) -> str:
    """Ask the model to generate a DuckDB SQL query for the given question and schema."""
    messages = [
        {
            "role": "system",
            "content": f"""

You are a data analyst.



You MUST use ONLY this table:

- Table name: data



Schema:

{schema_text}



Rules:

- Use ONLY table name "data"

- Return ONE valid DuckDB SQL query

- Do NOT explain

- Do NOT use markdown

"""
        },
        {"role": "user", "content": question},
    ]
    return call_openrouter_messages(messages)

def explain_result(question: str, df: pd.DataFrame) -> str:
    """Ask the model to explain the result set in plain language."""
    try:
        result_text = df.to_string(index=False)
    except Exception:
        result_text = str(df)

    messages = [
        {
            "role": "system",
            "content": """

You are a data analyst.



Given a user's question and a query result,

produce a concise, human-like explanation.



Rules:

- Do NOT mention SQL, databases, or tables

- Do NOT explain how the data was computed

- Be clear and business-friendly

"""
        },
        {
            "role": "user",
            "content": f"""

Question:

{question}



Query Result:

{result_text}

"""
        },
    ]
    return call_openrouter_messages(messages)

def sanitize_dataframe(df: pd.DataFrame):
    """Return a copy of df with column names sanitized for SQL identifiers."""
    if df is None or not isinstance(df, pd.DataFrame):
        return df, {}
    rename_map = {}
    used = set()
    for col in df.columns:
        new_col = re.sub(r"[^0-9a-zA-Z_]+", "_", str(col))
        new_col = new_col.strip("_")
        if re.match(r"^[0-9]", new_col):
            new_col = f"col_{new_col}"
        if not new_col:
            new_col = "col"
        base = new_col
        idx = 1
        while new_col in used:
            new_col = f"{base}_{idx}"
            idx += 1
        used.add(new_col)
        rename_map[col] = new_col
    return df.rename(columns=rename_map), rename_map

def run_duckdb_qa(question: str, dataframe: pd.DataFrame) -> str:
    """Generate SQL via LLM, run it on DuckDB, and explain the result."""
    if not question.strip():
        return "Please enter a question."
    if dataframe is None or not isinstance(dataframe, pd.DataFrame):
        return "No CSV data loaded."
    clean_df, rename_map = sanitize_dataframe(dataframe)
    con = duckdb.connect()
    try:
        con.register("data", clean_df)
        schema_df = con.execute("DESCRIBE data").fetch_df()
        schema_text = schema_df.to_string(index=False)
        sql = ask_llm(question, schema_text)
        if not isinstance(sql, str):
            return f"Unexpected SQL response: {sql}"
        sql = sql.strip().strip(";")
        sql = re.sub(r"\bSTDEV\s*\(", "STDDEV(", sql, flags=re.IGNORECASE)
        result_df = con.execute(sql).fetch_df()
    except Exception as exc:
        return f"SQL error: {exc}\nSQL used:\n{locals().get('sql', 'N/A')}"
    finally:
        con.close()

    return explain_result(question, result_df)

def format_data_preview(data, max_chars: int = 12000) -> str:
    """Return a trimmed, human-friendly preview to keep prompts under token limits."""
    if data is None:
        return "No data loaded."
    try:
        if isinstance(data, pd.DataFrame):
            preview = data.head(20).to_csv(index=False)
        elif isinstance(data, list):
            chunks = []
            for doc in data[:5]:
                text = getattr(doc, "page_content", str(doc))
                if len(text) > 1500:
                    text = text[:1500] + "...[truncated]"
                chunks.append(text)
            preview = "\n\n".join(chunks)
        else:
            preview = str(data)
    except Exception as exc:
        preview = f"Could not format data preview: {exc}"
    if len(preview) > max_chars:
        preview = preview[:max_chars] + "...[truncated]"
    return preview

def summarize_csv(dataframe: pd.DataFrame) -> str:
    """Build a compact summary (top items, payment mix) from a CSV DataFrame."""
    if dataframe is None or not isinstance(dataframe, pd.DataFrame):
        return ""
    summary_lines = []

    quantity_col = next((c for c in dataframe.columns if c.lower().startswith("quantity")), None)
    desc_col = None
    for candidate in ("Description", "Product", "Item", "Product_Name"):
        if candidate in dataframe.columns:
            desc_col = candidate
            break
    payment_col = next((c for c in dataframe.columns if "payment" in c.lower()), None)

    if quantity_col and desc_col:
        try:
            top_items = (
                dataframe.groupby(desc_col)[quantity_col]
                .sum()
                .sort_values(ascending=False)
                .head(10)
            )
            summary_lines.append("Top items by quantity (sum):")
            summary_lines.append(top_items.to_string())
        except Exception as exc:
            summary_lines.append(f"Could not compute top items: {exc}")

    if payment_col:
        try:
            payment_counts = dataframe[payment_col].value_counts().head(10)
            summary_lines.append("\nPayment method counts:")
            summary_lines.append(payment_counts.to_string())
        except Exception as exc:
            summary_lines.append(f"Could not compute payment counts: {exc}")

    return "\n".join(summary_lines)

def build_prompt(label: str, data, question: str, summary: str = "") -> str:
    preview = format_data_preview(data)
    summary_text = summary.strip()
    summary_block = f"\nData summary:\n{summary_text}\n" if summary_text else ""
    return f"""Do not reply with a python code.

Data preview ({label}, truncated to avoid context limits):



{preview}



{summary_block}

User question: {question}

"""

def pretty_print_columns(text):
    """

    Beautifies the provided CSV column description text.

    

    Args:

        text (str): The input string containing the column descriptions.

    

    Returns:

        str: The beautified string with neatly formatted column descriptions.

    """
    return "  ".join([line.strip() for line in text.splitlines() if line.strip()])

radioButtonList = ["E-commerce CSV (https://www.kaggle.com/datasets/mervemenekse/ecommerce-dataset)",
"Upload my own CSV",
"Upload my own PDF",
f"URL Chat with Google's Latest Earnings ({defaultGoogleURL})",
"Enter my own URL"]

# Visual polish
st.markdown("""

<style>

@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&display=swap');



html, body, [class*="css"] {

  font-family: 'Space Grotesk', system-ui, -apple-system, sans-serif;

}



.stApp {

  background: radial-gradient(circle at 20% 20%, #e2f2ff 0, #f8fafc 50%, #ffffff 100%);

  color: #0f172a;

}



.block-container {

  padding-top: 1.5rem;

  padding-bottom: 3rem;

  padding-left: 2.5rem;

  padding-right: 2.5rem;

}



.hero {

  position: relative;

  overflow: hidden;

  padding: 1.5rem 1.8rem;

  border-radius: 18px;

  border: 1px solid #e5e7eb;

  background: linear-gradient(135deg, rgba(59,130,246,0.12), rgba(16,185,129,0.08));

  box-shadow: 0 18px 45px rgba(15, 23, 42, 0.12);

}



.hero:before {

  content: "";

  position: absolute;

  right: -120px;

  top: -80px;

  width: 260px;

  height: 260px;

  background: radial-gradient(circle, rgba(59,130,246,0.15), transparent 55%);

  filter: blur(6px);

}



.hero h1 {

  margin: 0.15rem 0 0.35rem 0;

  font-size: 2rem;

  line-height: 1.2;

  letter-spacing: -0.02em;

  color: #0f172a;

}



.hero p {

  color: #0f172a;

  opacity: 0.9;

}



.eyebrow {

  text-transform: uppercase;

  letter-spacing: 0.14em;

  font-size: 0.75rem;

  font-weight: 700;

  color: #0ea5e9;

  margin: 0;

}



.pill-row {

  display: flex;

  gap: 0.5rem;

  flex-wrap: wrap;

  margin-top: 0.85rem;

}



.pill {

  padding: 0.35rem 0.65rem;

  border-radius: 10px;

  background: rgba(15, 23, 42, 0.08);

  font-size: 0.85rem;

  font-weight: 600;

}



.section-label {

  font-size: 0.85rem;

  letter-spacing: 0.06em;

  text-transform: uppercase;

  color: #475569;

  margin-bottom: 0.2rem;

  font-weight: 700;

}



.section-card {

  background: #ffffff;

  border: 1px solid #e5e7eb;

  border-radius: 16px;

  padding: 1.1rem 1.2rem;

  box-shadow: 0 12px 32px rgba(15, 23, 42, 0.08);

}



.section-card.compact {

  padding: 0.9rem 1rem;

  margin-top: 0.5rem;

}



.prompt-chip {

  display: inline-flex;

  align-items: center;

  gap: 0.35rem;

  padding: 0.5rem 0.75rem;

  border-radius: 12px;

  background: #0ea5e911;

  border: 1px solid #bae6fd;

  color: #0f172a;

  font-weight: 600;

}



.status-pill {

  display: inline-flex;

  align-items: center;

  gap: 0.35rem;

  padding: 0.45rem 0.7rem;

  border-radius: 999px;

  border: 1px solid #e2e8f0;

  font-weight: 600;

  font-size: 0.9rem;

}



.status-pill.ready {

  background: #ecfeff;

  border-color: #a5f3fc;

  color: #0f172a;

}



.status-pill.idle {

  background: #f8fafc;

  border-color: #e2e8f0;

  color: #475569;

}



.stRadio div[role="radiogroup"] {

  display: grid;

  gap: 0.4rem;

}



.stRadio div[role="radio"] {

  border: 1px solid #e2e8f0;

  padding: 0.85rem 1rem;

  border-radius: 12px;

  background: #f8fafc;

  transition: all 0.18s ease-in-out;

  box-shadow: 0 8px 22px rgba(15, 23, 42, 0.05);

}



.stRadio div[role="radio"][aria-checked="true"] {

  border-color: #2563eb;

  background: #ffffff;

  box-shadow: 0 18px 40px rgba(37, 99, 235, 0.15);

}



.stRadio div[role="radio"]:hover {

  border-color: #3b82f6;

  transform: translateY(-1px);

}



.stTextInput>div>div>input {

  border-radius: 12px;

  border: 1px solid #e2e8f0;

  background: #ffffff;

  padding: 0.75rem 0.85rem;

}



.stTextInput>div>div>input:focus {

  border-color: #2563eb;

  box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.15);

}



.stFileUploader {

  border-radius: 14px;

  border: 1px dashed #cbd5e1;

  padding: 0.4rem 0.75rem 0.75rem 0.75rem;

  background: #f8fafc;

}



.stButton>button {

  background: linear-gradient(135deg, #2563eb, #0ea5e9);

  color: #ffffff;

  border: none;

  padding: 0.75rem 1.35rem;

  border-radius: 12px;

  font-weight: 700;

  letter-spacing: 0.02em;

  box-shadow: 0 12px 30px rgba(14, 165, 233, 0.28);

  transition: transform 0.12s ease, box-shadow 0.12s ease;

}



.stButton>button:hover {

  transform: translateY(-1px);

  box-shadow: 0 16px 38px rgba(37, 99, 235, 0.32);

}



.stButton>button:active {

  transform: translateY(0);

}



.stButton>button:disabled {

  background: #e2e8f0;

  color: #94a3b8;

  box-shadow: none;

}

</style>

""", unsafe_allow_html=True)

st.markdown("""

<div class="hero">

  <p class="eyebrow">Document & Data Copilot</p>

  <h1>Chat with your PDFs, spreadsheets, or live web pages.</h1>

  <p>Upload or pick a preset, ask a focused question, and get a clear answer without digging through the source yourself.</p>

  <div class="pill-row">

    <span class="pill">Summaries</span>

    <span class="pill">Follow-up questions</span>

    <span class="pill">Trends & metrics</span>

    <span class="pill">Plain-language insights</span>

  </div>

</div>

""", unsafe_allow_html=True)

info_left, info_right = st.columns([1.05, 1])
with info_left:
    st.markdown("#### What you can do")
    st.markdown("- Skim long PDFs in a few bullet points\n- Ask for top performers or outliers in CSVs\n- Pull key quotes or facts from a URL\n- Iterate with follow-up questions to refine")

with info_right:
    st.markdown("#### Quick tips")
    st.markdown("- Keep prompts short and specific\n- Mention the format you want (bullets, table, headline)\n- Ask one question at a time for best results\n- You can chain questions; context is remembered")

st.markdown("### Choose a source to explore")
genre = st.radio(
    "Pick the content you want to chat with", radioButtonList, index=0, key="source_radio"
)

pdfCSVURLText = ""
exampleQuestion = ""
csv_data = None
pdf_pages = None

if genre==radioButtonList[1]:
    pdfCSVURLText = "CSV"
    exampleQuestion = "What are the data columns?"
elif genre==radioButtonList[2]:
    pdfCSVURLText = "PDF"
    exampleQuestion = "Can you summarize the contents?"
elif genre==radioButtonList[3]:
    pdfCSVURLText = "URL"
    exampleQuestion = "What is Google's latest earnings?"
elif genre==radioButtonList[4]:
    pdfCSVURLText = "URL"
    exampleQuestion = "Can you summarize the contents?"
else: # Default,  E-commerce CSV
    pdfCSVURLText = "CSV"
    exampleQuestion = "Question1: What was the most sold item? Question2: What was the most common payment?"
    if os.path.exists(DEFAULT_ECOMMERCE_CSV):
        try:
            csv_data = pd.read_csv(DEFAULT_ECOMMERCE_CSV)
        except Exception as exc:
            st.warning(f"Problem loading {DEFAULT_ECOMMERCE_CSV} ({exc}). Falling back to a small sample dataset.")
    if csv_data is None:
        # Keep a tiny inline sample so the app still works even when the CSV is missing locally.
        csv_data = pd.DataFrame(
            [
                {"InvoiceNo": "536365", "StockCode": "85123A", "Description": "White hanging heart", "Quantity": 6, "UnitPrice": 2.55, "Country": "United Kingdom"},
                {"InvoiceNo": "536366", "StockCode": "71053", "Description": "White metal lantern", "Quantity": 6, "UnitPrice": 3.39, "Country": "United Kingdom"},
                {"InvoiceNo": "536367", "StockCode": "84406B", "Description": "Pink mini hanging heart", "Quantity": 8, "UnitPrice": 1.65, "Country": "United Kingdom"},
            ]
        )
        st.info(f"{DEFAULT_ECOMMERCE_CSV} not found. Using an inline sample instead. Upload your own CSV if you need the full dataset.")

st.markdown("### Add your data")
st.caption("Upload a CSV/PDF or paste a URL. The built-in e-commerce sample is ready immediately.")
if exampleQuestion:
    st.markdown(
        f"""

        <div class="section-card compact">

            <div class="section-label">Suggested prompt</div>

            <div class="prompt-chip">{exampleQuestion}</div>

        </div>

        """,
        unsafe_allow_html=True,
    )

isCustomURL = genre==radioButtonList[4]
urlInput = st.text_input('Enter your own URL', '', placeholder=f"Type your URL here (e.g. {defaultGoogleURL})", disabled=not isCustomURL)

isCustomUpload = genre==radioButtonList[1] or genre==radioButtonList[2]
uploaded_file = st.file_uploader(f"Upload your own {pdfCSVURLText} here", type=pdfCSVURLText.lower(), disabled=not isCustomUpload)
uploadedFilename = ""
if uploaded_file is not None:
    if genre==radioButtonList[1]: # Custom CSV Upload
        try:
            csv_data = pd.read_csv(uploaded_file)
        except Exception as exc:
            st.error(f"Could not read uploaded CSV: {exc}")
    elif genre==radioButtonList[2]: # Custom PDF Upload
        with NamedTemporaryFile(dir='.', suffix=f'.{pdfCSVURLText.lower()}', delete=False) as f:
            f.write(uploaded_file.getbuffer())
            uploadedFilename = f.name
        try:
            loader = PyPDFLoader(uploadedFilename)
            pdf_pages = loader.load_and_split()
        except Exception as exc:
            st.error(f"Could not read uploaded PDF: {exc}")
        finally:
            if uploadedFilename and os.path.exists(uploadedFilename):
                os.remove(uploadedFilename)

enableChatBox = False
if genre==radioButtonList[1]: # Custom CSV Upload
    enableChatBox = isinstance(csv_data, pd.DataFrame)
elif genre==radioButtonList[2]: # Custom PDF Upload
    enableChatBox = pdf_pages is not None
elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
    enableChatBox = True
elif genre==radioButtonList[4]: # Custom URL
    enableChatBox = True
else: # E-commerce CSV
    enableChatBox = True

status_class = "ready" if enableChatBox else "idle"
status_text = "Ready to chat" if enableChatBox else "Load a file or URL to start"
st.markdown(f'<div class="status-pill {status_class}">{status_text}</div>', unsafe_allow_html=True)

st.markdown("### Ask a question")
st.caption("Short, specific prompts work best. You can ask follow-ups without reloading.")
chatTextStr = st.text_input(f'Ask me anything about this {pdfCSVURLText}', '', placeholder=f"Type here (e.g. {exampleQuestion})", disabled=not enableChatBox)
chatWithPDFButton = "CLICK HERE TO START CHATTING"
if st.button(chatWithPDFButton, disabled=not enableChatBox and not chatTextStr): #  Button Cliked
    if genre==radioButtonList[0]: # E-commerce CSV
        st.write(run_duckdb_qa(chatTextStr, csv_data))

    elif genre==radioButtonList[1]: # Custom CSV Upload
        st.write(run_duckdb_qa(chatTextStr, csv_data))

    elif genre==radioButtonList[2]: # Custom PDF Upload
        content = build_prompt("Uploaded PDF", pdf_pages, chatTextStr)
        st.write(call_openrouter(content))
    elif genre==radioButtonList[3]: # Google Alphabet URL Earnings Report
        loader = WebBaseLoader(defaultGoogleURL)
        web_data = loader.load()
        content = build_prompt("Google earnings URL", web_data, chatTextStr)
        st.write(call_openrouter(content))
    elif genre==radioButtonList[4]: # Custom URL
        if not urlInput.strip():
            st.warning("Please enter a URL first.")
        else:
            loader = WebBaseLoader(urlInput)
            web_data = loader.load()
            content = build_prompt("Custom URL", web_data, chatTextStr)
            st.write(call_openrouter(content))