Spaces:

CatLLM
/

survey-summarizer

Sleeping

File size: 25,731 Bytes

c44ee53

"""
Streamlit app - CatLLM Survey Response Summarizer
Based on the classifier app but focused on text/PDF summarization
"""

import hashlib
import json
import os
import sys
import tempfile
import time
from datetime import datetime

import pandas as pd
import streamlit as st

# Import catllm
# A failed import is reported on stdout instead of aborting the script; the
# outcome is recorded in CATLLM_AVAILABLE.
try:
    import catllm
    CATLLM_AVAILABLE = True
except ImportError as e:
    print(f"Warning: Could not import catllm: {e}")
    CATLLM_AVAILABLE = False

# NOTE(review): not referenced anywhere in the visible code; presumably an
# intended upload size cap in megabytes — confirm before relying on it.
MAX_FILE_SIZE_MB = 100

def count_pdf_pages(pdf_path):
    """Count the number of pages in a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to inspect.

    Returns:
        The page count reported by PyMuPDF, or 1 when the file cannot be
        opened or read (including when PyMuPDF is not installed).
    """
    try:
        import fitz  # PyMuPDF

        # The context manager closes the document even if counting raises,
        # so a corrupt file cannot leak an open handle.
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception:
        return 1  # Best-effort fallback: treat an unreadable file as one page


# Free tier: label shown in the UI -> model identifier sent to the API
FREE_MODELS_MAP = {
    "Qwen3 235B": "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
    "DeepSeek V3.1": "deepseek-ai/DeepSeek-V3.1:novita",
    "Llama 3.3 70B": "meta-llama/Llama-3.3-70B-Instruct:groq",
    "Gemini 2.5 Flash": "gemini-2.5-flash",
    "GPT-4o Mini": "gpt-4o-mini",
    "Mistral Medium": "mistral-medium-2505",
    "Claude 3 Haiku": "claude-3-haiku-20240307",
    "Grok 4 Fast": "grok-4-fast-non-reasoning",
}
# Labels in declaration order, ready to feed a select box
FREE_MODEL_DISPLAY_NAMES = [*FREE_MODELS_MAP]

# Bring-your-own-key tier: identifiers offered when the user supplies a key,
# grouped by vendor
PAID_MODEL_CHOICES = (
    ["gpt-4.1", "gpt-4o", "gpt-4o-mini"]
    + ["claude-sonnet-4-5-20250929", "claude-opus-4-20250514", "claude-3-5-haiku-20241022"]
    + ["gemini-2.5-pro", "gemini-2.5-flash"]
    + ["mistral-large-latest"]
)

# Free-tier identifiers that are routed through HuggingFace
HF_ROUTED_MODELS = [
    FREE_MODELS_MAP[label]
    for label in ("Qwen3 235B", "DeepSeek V3.1", "Llama 3.3 70B")
]


def is_free_model(model, model_tier):
    """Tell whether the request runs on the free tier (the Space pays for the API).

    Only ``model_tier`` is consulted; ``model`` is accepted but not used.
    """
    free_tier_label = "Free Models"
    return model_tier == free_tier_label


def get_model_source(model):
    """Auto-detect which provider backend serves ``model``.

    Args:
        model: Model identifier string, e.g. ``"gpt-4o-mini"`` or
            ``"meta-llama/Llama-3.3-70B-Instruct:groq"``.

    Returns:
        One of ``"openai"``, ``"anthropic"``, ``"google"``, ``"mistral"``,
        ``"huggingface"``, ``"perplexity"`` or ``"xai"``. Identifiers that
        match nothing fall back to ``"huggingface"``.
    """
    model_lower = model.lower()

    # Router-suffixed identifiers are served through HuggingFace no matter
    # which vendor name they contain, so this must be tested before the
    # vendor keywords (otherwise "mistralai/...:groq" would go to Mistral).
    if any(suffix in model_lower for suffix in (":novita", ":groq")):
        return "huggingface"

    # First matching keyword wins; the order mirrors the original precedence.
    keyword_sources = (
        ("gpt", "openai"),
        ("claude", "anthropic"),
        ("gemini", "google"),
        ("mistral", "mistral"),
        ("qwen", "huggingface"),
        ("llama", "huggingface"),
        ("deepseek", "huggingface"),
        ("sonar", "perplexity"),
        ("grok", "xai"),
    )
    for keyword, source in keyword_sources:
        if keyword in model_lower:
            return source
    return "huggingface"


def get_api_key(model, model_tier, api_key_input):
    """Resolve the API key to use for a request.

    On the free tier the key is read from the environment variable that
    matches the model; otherwise the key typed by the user is used.

    Returns:
        A ``(key, provider_label)`` tuple. ``key`` is an empty string when no
        key is available; ``provider_label`` is ``"User"`` outside the free
        tier.
    """
    # Bring-your-own-key: whatever the user typed, trimmed
    if not is_free_model(model, model_tier):
        typed_key = api_key_input.strip() if api_key_input else ""
        return typed_key, "User"

    # (keyword in model id, environment variable, provider label); first hit wins
    keyword_table = (
        ("gpt", "OPENAI_API_KEY", "OpenAI"),
        ("gemini", "GOOGLE_API_KEY", "Google"),
        ("mistral", "MISTRAL_API_KEY", "Mistral"),
        ("claude", "ANTHROPIC_API_KEY", "Anthropic"),
        ("sonar", "PERPLEXITY_API_KEY", "Perplexity"),
        ("grok", "XAI_API_KEY", "xAI"),
    )

    # HuggingFace is both the choice for routed models and the fallback
    env_var, provider = "HF_API_KEY", "HuggingFace"
    if model not in HF_ROUTED_MODELS:
        lowered = model.lower()
        for keyword, candidate_var, candidate_provider in keyword_table:
            if keyword in lowered:
                env_var, provider = candidate_var, candidate_provider
                break
    return os.environ.get(env_var, ""), provider


def generate_summarize_code(input_type, description, model, model_source, focus=None, max_length=None, instructions=None, mode=None):
    """Generate Python code for summarization.

    Builds a copy-pasteable script that repeats the summarization with the
    catllm package.

    Args:
        input_type: ``"text"`` selects the CSV/DataFrame template; any other
            value selects the PDF template.
        description: Description passed to ``catllm.summarize``.
        model: Model identifier written as ``user_model``.
        model_source: Provider name written as ``model_source``.
        focus: Optional focus text; omitted from the script when falsy.
        max_length: Optional word limit; omitted from the script when falsy.
        instructions: Optional extra instructions; omitted when falsy.
        mode: Optional PDF processing mode; only used by the PDF template.

    Returns:
        The script source as a string.
    """
    def as_literal(value):
        # JSON string syntax is also valid Python string-literal syntax, so
        # this escapes quotes, backslashes and newlines in user text while
        # leaving ordinary values rendered exactly as "value".
        return json.dumps(str(value), ensure_ascii=False)

    focus_param = f',\n    focus={as_literal(focus)}' if focus else ''
    length_param = f',\n    max_length={max_length}' if max_length else ''
    instructions_param = f',\n    instructions={as_literal(instructions)}' if instructions else ''

    description_lit = as_literal(description)
    model_lit = as_literal(model)
    model_source_lit = as_literal(model_source)

    if input_type == "text":
        return f'''import catllm
import pandas as pd

# Load your data
df = pd.read_csv("your_data.csv")

# Summarize the text column
result = catllm.summarize(
    input_data=df["your_column"].tolist(),
    api_key="YOUR_API_KEY",
    description={description_lit},
    user_model={model_lit},
    model_source={model_source_lit}{focus_param}{length_param}{instructions_param}
)

# View results
print(result)
result.to_csv("summarized_results.csv", index=False)
'''
    else:  # pdf
        mode_param = f',\n    mode={as_literal(mode)}' if mode else ''
        return f'''import catllm

# Summarize PDF documents
result = catllm.summarize(
    input_data="path/to/your/pdfs/",
    api_key="YOUR_API_KEY",
    description={description_lit},
    user_model={model_lit},
    model_source={model_source_lit}{mode_param}{focus_param}{length_param}{instructions_param}
)

# View results
print(result)
result.to_csv("summarized_results.csv", index=False)
'''


def generate_methodology_report_pdf(model, column_name, num_rows, model_source, filename, success_rate,
                          result_df=None, processing_time=None,
                          catllm_version=None, python_version=None,
                          input_type="text", description=None, focus=None, max_length=None):
    """Generate a PDF methodology report for summarization.

    Args:
        model: Model identifier shown in the summary table.
        column_name: Source column (or document description) shown in the table.
        num_rows: Number of items summarized; also used for timing averages.
        model_source: Provider name shown in the summary table.
        filename: Name of the source file shown in the summary table.
        success_rate: Percentage of successful items (formatted to 2 decimals).
        result_df: Accepted for interface compatibility; not used here.
        processing_time: Total seconds taken; the timing section is skipped
            when this is None.
        catllm_version: Version string, shown as "unknown" when falsy.
        python_version: Version string, shown as "unknown" when falsy.
        input_type: Accepted for interface compatibility; not used here.
        description: Accepted for interface compatibility; not used here.
        focus: Optional focus text; adds a "Focus" row when truthy.
        max_length: Optional word limit; adds a "Max Length" row when truthy.

    Returns:
        Path of the generated PDF. The file is created with ``delete=False``,
        so removing it is left to the caller.
    """
    from reportlab.lib.pagesizes import letter
    from reportlab.lib import colors
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

    def key_value_table(rows, col_widths):
        # Two-column table with a shaded label column; all three tables in
        # the report share this look.
        table = Table(rows, colWidths=col_widths)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('PADDING', (0, 0), (-1, -1), 6),
            ('FONTSIZE', (0, 0), (-1, -1), 9),
        ]))
        return table

    # Only the unique path is needed: the document is built from the path,
    # so close our own handle immediately instead of leaking it.
    with tempfile.NamedTemporaryFile(mode='wb', suffix='_methodology_report.pdf', delete=False) as pdf_file:
        pdf_path = pdf_file.name
    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()

    title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
    heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
    normal_style = styles['Normal']

    story = []

    report_title = "CatLLM Summarization Report"
    story.append(Paragraph(report_title, title_style))
    story.append(Paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", normal_style))
    story.append(Spacer(1, 15))

    story.append(Paragraph("About This Report", heading_style))
    about_text = """This methodology report documents the automated summarization process. \
CatLLM uses LLMs to generate concise summaries of text or PDF documents, providing \
consistent and reproducible results."""
    story.append(Paragraph(about_text, normal_style))
    story.append(Spacer(1, 15))

    # Summary section
    story.append(Paragraph("Summarization Summary", heading_style))
    story.append(Spacer(1, 10))

    summary_data = [
        ["Source File", filename],
        ["Source Column/Type", column_name],
        ["Model Used", model],
        ["Model Source", model_source],
        ["Items Summarized", str(num_rows)],
        ["Success Rate", f"{success_rate:.2f}%"],
    ]
    if focus:
        summary_data.append(["Focus", focus])
    if max_length:
        summary_data.append(["Max Length", f"{max_length} words"])

    story.append(key_value_table(summary_data, [150, 300]))
    story.append(Spacer(1, 15))

    # Timing section (only when the caller measured the run)
    if processing_time is not None:
        story.append(Paragraph("Processing Time", heading_style))
        # Guard both divisions so a zero-length run or empty input reports 0
        rows_per_min = (num_rows / processing_time) * 60 if processing_time > 0 else 0
        avg_time = processing_time / num_rows if num_rows > 0 else 0

        time_data = [
            ["Total Processing Time", f"{processing_time:.1f} seconds"],
            ["Average Time per Item", f"{avg_time:.2f} seconds"],
            ["Processing Rate", f"{rows_per_min:.1f} items/minute"],
        ]
        story.append(key_value_table(time_data, [180, 270]))

    story.append(Spacer(1, 15))
    story.append(Paragraph("Version Information", heading_style))
    version_data = [
        ["CatLLM Version", catllm_version or "unknown"],
        ["Python Version", python_version or "unknown"],
        ["Timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
    ]
    story.append(key_value_table(version_data, [180, 270]))

    story.append(Spacer(1, 30))
    story.append(Paragraph("Citation", heading_style))
    story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
    story.append(Spacer(1, 5))
    story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DOI: 10.5281/zenodo.15532316", normal_style))

    doc.build(story)
    return pdf_path


# Page config
st.set_page_config(
    layout="wide",
    page_icon="🐱",
    page_title="CatLLM - Research Data Summarizer",
)

# Seed each session-state slot with None the first time the script runs
for _state_key in ('results', 'survey_data', 'pdf_data'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None

# Header row: narrow logo column next to a wide title column
logo_col, title_col = st.columns([1, 6])

with logo_col:
    st.image("logo.png", width=100)

with title_col:
    st.title("CatLLM - Research Data Summarizer")
    st.markdown("Generate concise summaries of survey responses and PDF documents using LLMs.")

# About section
# Collapsible panel rendered from one static Markdown string: privacy notice,
# feature list, beta/feedback contacts, project links and citation.
with st.expander("About This App"):
    st.markdown("""
**Privacy Notice:** Your data is sent to third-party LLM APIs for summarization. Do not upload sensitive, confidential, or personally identifiable information (PII).

---

**CatLLM** is an open-source Python package for processing text and document data using Large Language Models.

### What It Does
- **Summarize Text**: Generate concise summaries of survey responses or text data
- **Summarize PDFs**: Extract key information from PDF documents page-by-page
- **Focus Summaries**: Guide the model to focus on specific aspects of your data

### Beta Test - We Want Your Feedback!
This app is currently in **beta** and **free to use** while CatLLM is under review for publication, made possible by **Bashir Ahmed's generous fellowship support**.

- Found a bug? Have a feature request? Please open an issue on [GitHub](https://github.com/chrissoria/cat-llm)
- Reach out directly: [chrissoria@berkeley.edu](mailto:chrissoria@berkeley.edu)

### Links
- **PyPI**: [pip install cat-llm](https://pypi.org/project/cat-llm/)
- **GitHub**: [github.com/chrissoria/cat-llm](https://github.com/chrissoria/cat-llm)
- **Classifier App**: [CatLLM Survey Classifier](https://huggingface.co/spaces/CatLLM/survey-classifier)

### Citation
If you use CatLLM in your research, please cite:
```
Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DOI: 10.5281/zenodo.15532316
```
""")

# Main layout
# Two equal-width columns: inputs and controls on the left, results on the right.
col_input, col_output = st.columns([1, 1])

with col_input:
    # Input type selector
    input_type_choice = st.radio(
        "Input Type",
        options=["Survey Responses", "PDF Documents"],
        horizontal=True,
        key="input_type_radio"
    )

    # Initialize variables
    # Defaults for values the branches below may overwrite. input_data stays
    # None until data has been loaded, which the Summarize handler checks.
    input_data = None
    input_type_selected = "text"
    description = ""
    original_filename = "data"
    pdf_mode = "Image (visual documents)"

    # Load the data to summarize: a column of a tabular file, or temp copies
    # of the uploaded PDFs. On success this sets input_data, description and
    # original_filename for the Summarize handler.
    if input_type_choice == "Survey Responses":
        input_type_selected = "text"

        uploaded_file = st.file_uploader(
            "Upload Data (CSV or Excel)",
            type=['csv', 'xlsx', 'xls'],
            key="survey_file"
        )

        if st.button("Try Example Dataset", key="example_btn"):
            st.session_state.example_loaded = True

        columns = []
        df = None
        if uploaded_file is not None:
            try:
                # Case-insensitive so "DATA.CSV" is not handed to the Excel reader
                if uploaded_file.name.lower().endswith('.csv'):
                    df = pd.read_csv(uploaded_file)
                else:
                    df = pd.read_excel(uploaded_file)
                columns = df.columns.tolist()
                st.success(f"Loaded {len(df):,} rows")
            except Exception as e:
                st.error(f"Error loading file: {e}")
        elif st.session_state.get('example_loaded'):
            try:
                df = pd.read_csv("example_data.csv")
                columns = df.columns.tolist()
                st.success(f"Loaded example dataset ({len(df)} rows)")
            except Exception as e:
                # Report the failure instead of silently leaving the page empty
                st.error(f"Error loading example dataset: {e}")

        selected_column = st.selectbox(
            "Column to Summarize",
            options=columns if columns else ["Upload a file first"],
            disabled=not columns,
            key="survey_column"
        )

        description = selected_column if columns else ""
        original_filename = uploaded_file.name if uploaded_file else "example_data.csv"

        if df is not None and columns and selected_column in columns:
            input_data = df[selected_column].tolist()

    else:  # PDF Documents
        input_type_selected = "pdf"

        pdf_files = st.file_uploader(
            "Upload PDF Document(s)",
            type=['pdf'],
            accept_multiple_files=True,
            key="pdf_files"
        )

        pdf_description = st.text_input(
            "Document Description",
            placeholder="e.g., 'research papers', 'interview transcripts'",
            help="Helps the LLM understand context",
            key="pdf_desc"
        )

        pdf_mode = st.radio(
            "Processing Mode",
            options=["Image (visual documents)", "Text (text-heavy)", "Both (comprehensive)"],
            key="pdf_mode"
        )

        if pdf_files:
            # Streamlit re-executes this script on every widget interaction.
            # Remember the temp copy written for each upload so a rerun reuses
            # it instead of writing (and leaking) another file each time.
            if 'pdf_temp_cache' not in st.session_state:
                st.session_state.pdf_temp_cache = {}
            pdf_temp_cache = st.session_state.pdf_temp_cache

            input_data = []
            pdf_name_map = {}  # Map temp paths to original filenames
            for f in pdf_files:
                # getvalue() returns the full content regardless of the
                # current read position of the upload buffer
                payload = f.getvalue()
                cache_key = (f.name, hashlib.sha256(payload).hexdigest())
                tmp_path = pdf_temp_cache.get(cache_key)
                if tmp_path is None or not os.path.exists(tmp_path):
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                        tmp.write(payload)
                    tmp_path = tmp.name
                    pdf_temp_cache[cache_key] = tmp_path
                input_data.append(tmp_path)
                # splitext drops only the final extension, so a name such as
                # "a.pdf.notes.pdf" keeps its inner ".pdf"
                pdf_name_map[tmp_path] = os.path.splitext(f.name)[0]
            st.session_state.pdf_name_map = pdf_name_map
            description = pdf_description or "document"
            original_filename = "pdf_files"
            st.success(f"Uploaded {len(pdf_files)} PDF file(s)")

    st.markdown("---")

    # Summarization options
    # Three optional controls; each is only forwarded to the summarizer by the
    # Summarize handler when it holds a non-empty value.
    st.markdown("### Summarization Options")

    focus = st.text_input(
        "Focus (optional)",
        placeholder="e.g., 'main arguments', 'emotional content', 'key findings'",
        help="Guide the model to focus on specific aspects"
    )

    max_length = st.number_input(
        "Maximum Summary Length (words, optional)",
        min_value=0,
        max_value=1000,
        value=0,
        help="Leave at 0 for no limit"
    )
    # 0 is the widget's "no limit" value; represent it as None from here on
    max_length = max_length if max_length > 0 else None

    instructions = st.text_input(
        "Additional Instructions (optional)",
        placeholder="e.g., 'use bullet points', 'include quotes'",
        help="Custom instructions for the summarization"
    )

    st.markdown("---")

    # Model selection
    st.markdown("### Model Selection")
    model_tier = st.radio(
        "Model Tier",
        options=["Free Models", "Bring Your Own Key"],
        key="model_tier"
    )

    if model_tier == "Free Models":
        # Free tier shows display labels and maps the choice to the API model name;
        # no key is collected from the user in this branch
        model_display = st.selectbox("Model", options=FREE_MODEL_DISPLAY_NAMES, key="model")
        model = FREE_MODELS_MAP[model_display]
        api_key = ""
    else:
        # Bring-your-own-key tier lists API model names directly and asks for a key
        model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="model_paid")
        api_key = st.text_input("API Key", type="password", key="api_key")

    # Summarize button
    # Validates the inputs, runs catllm.summarize with live progress, then
    # stores the result table, CSV path, PDF report path and reproducibility
    # code in st.session_state.results before rerunning the script.
    if st.button("Summarize Data", type="primary", use_container_width=True):
        if input_data is None:
            st.error("Please upload data first")
        elif not CATLLM_AVAILABLE:
            # Without this guard the call below fails with a confusing NameError
            st.error("The catllm package could not be imported, so summarization is unavailable.")
        else:
            mode = None
            if input_type_selected == "pdf":
                mode_mapping = {
                    "Image (visual documents)": "image",
                    "Text (text-heavy)": "text",
                    "Both (comprehensive)": "both"
                }
                mode = mode_mapping.get(pdf_mode, "image")

            actual_api_key, provider = get_api_key(model, model_tier, api_key)
            if not actual_api_key:
                st.error(f"{provider} API key not configured")
            else:
                model_source = get_model_source(model)
                items_list = input_data if isinstance(input_data, list) else [input_data]

                # Trim the optional free-text fields once so the API call, the
                # report and the generated code all see the same values
                focus_clean = focus.strip() if focus else ""
                instructions_clean = instructions.strip() if instructions else ""

                # Calculate estimated time (rough: 5s per PDF page, 2s per text item)
                num_items = len(items_list)
                if input_type_selected == "pdf":
                    total_pages = sum(count_pdf_pages(p) for p in items_list)
                    est_seconds = total_pages * 5
                else:
                    est_seconds = max(10, num_items * 2)

                est_time_str = f"{est_seconds:.0f}s" if est_seconds < 60 else f"{est_seconds/60:.1f}m"

                # Progress UI
                progress_bar = st.progress(0)
                status_text = st.empty()
                status_text.text(f"Starting... estimated time: ~{est_time_str}")
                start_time = time.time()

                def progress_callback(current_idx, total, label=None):
                    """Update the progress bar and status line for one callback tick."""
                    progress = current_idx / total if total > 0 else 0
                    progress_bar.progress(min(progress, 1.0))

                    elapsed = time.time() - start_time
                    if current_idx > 0:
                        avg_time = elapsed / current_idx
                        # Clamp so a final tick with current_idx == total never shows a negative ETA
                        eta_seconds = avg_time * max(total - current_idx, 0)
                        eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
                    else:
                        eta_str = ""

                    label_str = f" ({label})" if label else ""
                    # Cap the 1-based item number so it never reads "item N+1 of N"
                    item_number = min(current_idx + 1, max(total, 1))
                    status_text.text(f"Processing item {item_number} of {total}{label_str} ({progress*100:.0f}%){eta_str}")

                try:
                    # Build kwargs for summarize; optional settings are only
                    # passed when the user actually provided them
                    summarize_kwargs = {
                        "input_data": items_list,
                        "api_key": actual_api_key,
                        "description": description,
                        "user_model": model,
                        "model_source": model_source,
                        "progress_callback": progress_callback,
                    }
                    if mode:
                        summarize_kwargs["mode"] = mode
                    if focus_clean:
                        summarize_kwargs["focus"] = focus_clean
                    if max_length:
                        summarize_kwargs["max_length"] = max_length
                    if instructions_clean:
                        summarize_kwargs["instructions"] = instructions_clean

                    result_df = catllm.summarize(**summarize_kwargs)

                    processing_time = time.time() - start_time
                    total_items = len(result_df)
                    progress_bar.progress(1.0)
                    status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")

                    # Replace temp paths with original filenames for PDF input
                    if input_type_selected == "pdf" and 'pdf_path' in result_df.columns:
                        pdf_name_map = st.session_state.get('pdf_name_map', {})

                        def replace_temp_path(val):
                            """Swap a temp-file path inside ``val`` for the uploaded file's name."""
                            if pd.isna(val):
                                return val
                            val_str = str(val)
                            for temp_path, orig_name in pdf_name_map.items():
                                if temp_path in val_str:
                                    return val_str.replace(temp_path, orig_name + '.pdf')
                            return val_str

                        result_df['pdf_path'] = result_df['pdf_path'].apply(replace_temp_path)

                    # Save CSV through the handle we already hold; reopening a
                    # still-open NamedTemporaryFile by name is not portable
                    with tempfile.NamedTemporaryFile(mode='w', suffix='_summarized.csv', delete=False,
                                                     newline='', encoding='utf-8') as f:
                        result_df.to_csv(f, index=False)
                        csv_path = f.name

                    # Calculate success rate (an empty result counts as 0%, not a division by zero)
                    if 'processing_status' in result_df.columns:
                        success_count = int((result_df['processing_status'] == 'success').sum())
                        success_rate = (success_count / total_items) * 100 if total_items else 0.0
                    else:
                        success_rate = 100.0

                    # Get version info
                    try:
                        catllm_version = catllm.__version__
                    except AttributeError:
                        catllm_version = "unknown"
                    python_version = sys.version.split()[0]

                    # Generate methodology report
                    pdf_path = generate_methodology_report_pdf(
                        model=model,
                        column_name=description,
                        num_rows=total_items,
                        model_source=model_source,
                        filename=original_filename,
                        success_rate=success_rate,
                        result_df=result_df,
                        processing_time=processing_time,
                        catllm_version=catllm_version,
                        python_version=python_version,
                        input_type=input_type_selected,
                        description=description,
                        focus=focus_clean or None,
                        max_length=max_length
                    )

                    # Generate code
                    code = generate_summarize_code(
                        input_type_selected, description, model, model_source,
                        focus=focus_clean or None,
                        max_length=max_length,
                        instructions=instructions_clean or None,
                        mode=mode
                    )

                    st.session_state.results = {
                        'df': result_df,
                        'csv_path': csv_path,
                        'pdf_path': pdf_path,
                        'code': code,
                        'status': f"Summarized {total_items} items in {processing_time:.1f}s",
                    }
                    st.success(f"Summarized {total_items} items in {processing_time:.1f}s")
                    st.rerun()

                except Exception as e:
                    st.error(f"Error: {str(e)}")

with col_output:
    st.markdown("### Results")

    if not st.session_state.results:
        st.info("Upload data and click 'Summarize Data' to see results here.")
    else:
        results = st.session_state.results

        # Placeholder for future chart
        st.info("Summary visualization coming soon!")

        # Results table, minus the raw model-output columns when present
        hidden_cols = ['model_response', 'json', 'raw_response', 'raw_json']
        display_df = results['df'].drop(columns=hidden_cols, errors='ignore')
        st.dataframe(display_df, use_container_width=True)

        # Downloads: one button per column, CSV on the left and PDF on the right
        download_specs = (
            ('csv_path', "Download Results (CSV)", "summarized_results.csv", "text/csv"),
            ('pdf_path', "Download Methodology Report (PDF)", "methodology_report.pdf", "application/pdf"),
        )
        for slot, (path_key, label, out_name, mime_type) in zip(st.columns(2), download_specs):
            with slot:
                with open(results[path_key], 'rb') as fh:
                    st.download_button(
                        label,
                        data=fh,
                        file_name=out_name,
                        mime=mime_type
                    )

        # Code
        with st.expander("See the Code"):
            st.code(results['code'], language='python')

# Bottom buttons
col_reset, col_code = st.columns(2)
with col_reset:
    if st.button("Reset", type="secondary", use_container_width=True):
        st.session_state.results = None
        # Drop every per-run flag. Clearing show_code_modal matters: if it
        # stayed set, the code panel would reopen by itself after the next run.
        for stale_key in ('example_loaded', 'show_code_modal'):
            if stale_key in st.session_state:
                del st.session_state[stale_key]
        st.rerun()

with col_code:
    # The button is only offered once there is generated code to show
    if st.session_state.results:
        if st.button("See in Code", use_container_width=True):
            st.session_state.show_code_modal = True

# Code modal/dialog
# Rendered inline at the bottom of the page while the flag is set and results exist.
if st.session_state.get('show_code_modal') and st.session_state.results:
    st.markdown("---")
    st.markdown("### Reproducibility Code")
    st.markdown("Use this code to reproduce the summarization with the CatLLM Python package:")
    st.code(st.session_state.results['code'], language='python')
    if st.button("Close"):
        st.session_state.show_code_modal = False
        st.rerun()