Spaces:

CatLLM
/

survey-summarizer

Running

App Files Files Community

survey-summarizer / app.py

chrissoria

Initial summarizer app

c44ee53 3 days ago

raw

history blame contribute delete

25.7 kB

	"""
	Streamlit app - CatLLM Survey Response Summarizer
	Based on the classifier app but focused on text/PDF summarization
	"""

	import streamlit as st
	import pandas as pd
	import tempfile
	import os
	import time
	import sys
	from datetime import datetime

	# catllm is imported defensively: a failed import is reported on stdout and
	# recorded in CATLLM_AVAILABLE instead of aborting the script.
	try:
	    import catllm
	except ImportError as import_error:
	    print(f"Warning: Could not import catllm: {import_error}")
	    CATLLM_AVAILABLE = False
	else:
	    CATLLM_AVAILABLE = True

	# Size ceiling in megabytes (not referenced by the code visible in this file).
	MAX_FILE_SIZE_MB = 100

	def count_pdf_pages(pdf_path):
	    """Return the number of pages in the PDF at ``pdf_path``.

	    The count is best-effort: if PyMuPDF is not installed or the file cannot
	    be opened or read, 1 is returned instead of raising.
	    """
	    try:
	        import fitz  # PyMuPDF

	        # The context manager closes the document even if counting raises.
	        with fitz.open(pdf_path) as doc:
	            return len(doc)
	    except Exception:
	        return 1  # Default to 1 if can't read


	# Free tier: label shown in the UI -> model identifier passed to the API.
	FREE_MODELS_MAP = dict([
	    ("Qwen3 235B", "Qwen/Qwen3-VL-235B-A22B-Instruct:novita"),
	    ("DeepSeek V3.1", "deepseek-ai/DeepSeek-V3.1:novita"),
	    ("Llama 3.3 70B", "meta-llama/Llama-3.3-70B-Instruct:groq"),
	    ("Gemini 2.5 Flash", "gemini-2.5-flash"),
	    ("GPT-4o Mini", "gpt-4o-mini"),
	    ("Mistral Medium", "mistral-medium-2505"),
	    ("Claude 3 Haiku", "claude-3-haiku-20240307"),
	    ("Grok 4 Fast", "grok-4-fast-non-reasoning"),
	])
	# Labels in insertion order, used as the options of the free-model selector.
	FREE_MODEL_DISPLAY_NAMES = [*FREE_MODELS_MAP]

	# Paid tier: identifiers offered when the user supplies their own API key.
	PAID_MODEL_CHOICES = [
	    "gpt-4.1", "gpt-4o", "gpt-4o-mini",
	    "claude-sonnet-4-5-20250929", "claude-opus-4-20250514", "claude-3-5-haiku-20241022",
	    "gemini-2.5-pro", "gemini-2.5-flash",
	    "mistral-large-latest",
	]

	# Identifiers that are routed through HuggingFace (checked first by get_api_key).
	HF_ROUTED_MODELS = [
	    "Qwen/Qwen3-VL-235B-A22B-Instruct:novita",
	    "deepseek-ai/DeepSeek-V3.1:novita",
	    "meta-llama/Llama-3.3-70B-Instruct:groq",
	]


	def is_free_model(model, model_tier):
	    """Return True when the selected tier label is "Free Models".

	    ``model`` is part of the signature but is not consulted; only the tier
	    label decides the result.
	    """
	    free_tier_selected = model_tier == "Free Models"
	    return free_tier_selected


	def get_model_source(model):
	    """Infer the provider name from substrings of the model identifier.

	    Matching is case-insensitive and the first rule that matches wins, so the
	    order of the rules below is significant. Identifiers that match no rule
	    are attributed to "huggingface".
	    """
	    name = model.lower()
	    hf_markers = (":novita", ":groq", "qwen", "llama", "deepseek")

	    # (predicate, provider) pairs evaluated top to bottom.
	    rules = (
	        (lambda: "gpt" in name, "openai"),
	        (lambda: "claude" in name, "anthropic"),
	        (lambda: "gemini" in name, "google"),
	        # A ":novita" suffix means the Mistral model falls through to the HF rule.
	        (lambda: "mistral" in name and ":novita" not in name, "mistral"),
	        (lambda: any(marker in name for marker in hf_markers), "huggingface"),
	        (lambda: "sonar" in name, "perplexity"),
	        (lambda: "grok" in name, "xai"),
	    )
	    for matches, provider in rules:
	        if matches():
	            return provider
	    return "huggingface"


	def get_api_key(model, model_tier, api_key_input):
	    """Return ``(api_key, provider_label)`` for the chosen model and tier.

	    On the free tier the key is read from an environment variable selected by
	    the model identifier (an empty string when the variable is unset). On any
	    other tier the user's own input is returned, stripped, with the label
	    "User"; missing or blank input yields an empty key.
	    """
	    if not is_free_model(model, model_tier):
	        user_key = api_key_input.strip() if api_key_input else ""
	        return user_key, "User"

	    hf_credentials = ("HF_API_KEY", "HuggingFace")
	    if model in HF_ROUTED_MODELS:
	        env_var, provider = hf_credentials
	    else:
	        name = model.lower()
	        # (substring, environment variable, provider label); first match wins.
	        keyword_table = (
	            ("gpt", "OPENAI_API_KEY", "OpenAI"),
	            ("gemini", "GOOGLE_API_KEY", "Google"),
	            ("mistral", "MISTRAL_API_KEY", "Mistral"),
	            ("claude", "ANTHROPIC_API_KEY", "Anthropic"),
	            ("sonar", "PERPLEXITY_API_KEY", "Perplexity"),
	            ("grok", "XAI_API_KEY", "xAI"),
	        )
	        env_var, provider = next(
	            ((var, label) for keyword, var, label in keyword_table if keyword in name),
	            hf_credentials,
	        )
	    return os.environ.get(env_var, ""), provider


	def generate_summarize_code(input_type, description, model, model_source, focus=None, max_length=None, instructions=None, mode=None):
	    """Generate a Python snippet that reproduces the summarization call.

	    Args:
	        input_type: "text" selects the CSV/column template; any other value
	            selects the PDF template.
	        description: Description passed to ``catllm.summarize``.
	        model: Model identifier written as ``user_model``.
	        model_source: Provider name written as ``model_source``.
	        focus: Optional focus text; omitted from the snippet when falsy.
	        max_length: Optional word limit; omitted from the snippet when falsy.
	        instructions: Optional extra instructions; omitted when falsy.
	        mode: Optional PDF processing mode; only used by the PDF template.

	    Returns:
	        The snippet as a string. Text values are emitted as escaped string
	        literals, so quotes, backslashes or newlines typed by the user cannot
	        produce syntactically broken code.
	    """
	    import json

	    def _quoted(value):
	        # json.dumps emits a double-quoted literal with quotes, backslashes and
	        # control characters escaped; plain values render exactly as before.
	        return json.dumps(str(value), ensure_ascii=False)

	    focus_param = f',\n focus={_quoted(focus)}' if focus else ''
	    length_param = f',\n max_length={max_length}' if max_length else ''
	    instructions_param = f',\n instructions={_quoted(instructions)}' if instructions else ''

	    if input_type == "text":
	        return f'''import catllm
	import pandas as pd

	# Load your data
	df = pd.read_csv("your_data.csv")

	# Summarize the text column
	result = catllm.summarize(
	input_data=df["your_column"].tolist(),
	api_key="YOUR_API_KEY",
	description={_quoted(description)},
	user_model={_quoted(model)},
	model_source={_quoted(model_source)}{focus_param}{length_param}{instructions_param}
	)

	# View results
	print(result)
	result.to_csv("summarized_results.csv", index=False)
	'''
	    else:  # pdf
	        mode_param = f',\n mode={_quoted(mode)}' if mode else ''
	        return f'''import catllm

	# Summarize PDF documents
	result = catllm.summarize(
	input_data="path/to/your/pdfs/",
	api_key="YOUR_API_KEY",
	description={_quoted(description)},
	user_model={_quoted(model)},
	model_source={_quoted(model_source)}{mode_param}{focus_param}{length_param}{instructions_param}
	)

	# View results
	print(result)
	result.to_csv("summarized_results.csv", index=False)
	'''


	def generate_methodology_report_pdf(model, column_name, num_rows, model_source, filename, success_rate,
	                                    result_df=None, processing_time=None,
	                                    catllm_version=None, python_version=None,
	                                    input_type="text", description=None, focus=None, max_length=None):
	    """Write a PDF methodology report for a summarization run and return its path.

	    Args:
	        model: Model identifier shown in the summary table.
	        column_name: Source column (or document description) shown in the table.
	        num_rows: Number of items summarized; also used for the timing averages.
	        model_source: Provider name shown in the summary table.
	        filename: Name of the source file shown in the summary table.
	        success_rate: Success percentage, rendered with two decimals.
	        result_df: Accepted for interface compatibility; not used by the report.
	        processing_time: Total run time in seconds. The timing section is
	            omitted when this is None.
	        catllm_version: Version string for the report; "unknown" when falsy.
	        python_version: Version string for the report; "unknown" when falsy.
	        input_type: Accepted for interface compatibility; not used by the report.
	        description: Accepted for interface compatibility; not used by the report.
	        focus: Optional focus text; adds a "Focus" row when truthy.
	        max_length: Optional word limit; adds a "Max Length" row when truthy.

	    Returns:
	        Path of the generated PDF. The file is created with ``delete=False``,
	        so removing it is left to the caller.
	    """
	    from reportlab.lib.pagesizes import letter
	    from reportlab.lib import colors
	    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

	    def _label_value_table(rows, col_widths):
	        # Every table in the report shares one look: grey label column, black grid.
	        table = Table(rows, colWidths=col_widths)
	        table.setStyle(TableStyle([
	            ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
	            ('GRID', (0, 0), (-1, -1), 1, colors.black),
	            ('PADDING', (0, 0), (-1, -1), 6),
	            ('FONTSIZE', (0, 0), (-1, -1), 9),
	        ]))
	        return table

	    # Reserve a unique path, then close the handle before ReportLab writes to
	    # it by name; the original left this handle open for the process lifetime.
	    with tempfile.NamedTemporaryFile(mode='wb', suffix='_methodology_report.pdf', delete=False) as pdf_file:
	        pdf_path = pdf_file.name
	    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
	    styles = getSampleStyleSheet()

	    title_style = ParagraphStyle('Title', parent=styles['Heading1'], fontSize=18, spaceAfter=20)
	    heading_style = ParagraphStyle('Heading', parent=styles['Heading2'], fontSize=14, spaceAfter=10, spaceBefore=15)
	    normal_style = styles['Normal']

	    # One timestamp for the whole report so the header and version table agree.
	    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

	    story = []

	    story.append(Paragraph("CatLLM Summarization Report", title_style))
	    story.append(Paragraph(f"Generated: {generated_at}", normal_style))
	    story.append(Spacer(1, 15))

	    story.append(Paragraph("About This Report", heading_style))
	    about_text = (
	        "This methodology report documents the automated summarization process. "
	        "CatLLM uses LLMs to generate concise summaries of text or PDF documents, providing "
	        "consistent and reproducible results."
	    )
	    story.append(Paragraph(about_text, normal_style))
	    story.append(Spacer(1, 15))

	    # Summary section
	    story.append(Paragraph("Summarization Summary", heading_style))
	    story.append(Spacer(1, 10))

	    summary_data = [
	        ["Source File", filename],
	        ["Source Column/Type", column_name],
	        ["Model Used", model],
	        ["Model Source", model_source],
	        ["Items Summarized", str(num_rows)],
	        ["Success Rate", f"{success_rate:.2f}%"],
	    ]
	    if focus:
	        summary_data.append(["Focus", focus])
	    if max_length:
	        summary_data.append(["Max Length", f"{max_length} words"])

	    story.append(_label_value_table(summary_data, [150, 300]))
	    story.append(Spacer(1, 15))

	    if processing_time is not None:
	        story.append(Paragraph("Processing Time", heading_style))
	        # Both ratios fall back to 0 rather than dividing by zero.
	        rows_per_min = (num_rows / processing_time) * 60 if processing_time > 0 else 0
	        avg_time = processing_time / num_rows if num_rows > 0 else 0

	        time_data = [
	            ["Total Processing Time", f"{processing_time:.1f} seconds"],
	            ["Average Time per Item", f"{avg_time:.2f} seconds"],
	            ["Processing Rate", f"{rows_per_min:.1f} items/minute"],
	        ]
	        story.append(_label_value_table(time_data, [180, 270]))

	    story.append(Spacer(1, 15))
	    story.append(Paragraph("Version Information", heading_style))
	    version_data = [
	        ["CatLLM Version", catllm_version or "unknown"],
	        ["Python Version", python_version or "unknown"],
	        ["Timestamp", generated_at],
	    ]
	    story.append(_label_value_table(version_data, [180, 270]))

	    story.append(Spacer(1, 30))
	    story.append(Paragraph("Citation", heading_style))
	    story.append(Paragraph("If you use CatLLM in your research, please cite:", normal_style))
	    story.append(Spacer(1, 5))
	    story.append(Paragraph("Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DOI: 10.5281/zenodo.15532316", normal_style))

	    doc.build(story)
	    return pdf_path


	# Page config
	st.set_page_config(
	    page_title="CatLLM - Research Data Summarizer",
	    page_icon="🐱",
	    layout="wide",
	)

	# Seed the session-state slots with None on first run; existing values are kept.
	for state_key in ('results', 'survey_data', 'pdf_data'):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = None

	# Header row: narrow logo column next to a wide title column.
	col_logo, col_title = st.columns([1, 6])
	with col_logo:
	    st.image("logo.png", width=100)
	with col_title:
	    st.title("CatLLM - Research Data Summarizer")
	    tagline = "Generate concise summaries of survey responses and PDF documents using LLMs."
	    st.markdown(tagline)

	# Text of the collapsible "About" section (privacy notice, links, citation).
	about_markdown = """
	Privacy Notice: Your data is sent to third-party LLM APIs for summarization. Do not upload sensitive, confidential, or personally identifiable information (PII).

	---

	CatLLM is an open-source Python package for processing text and document data using Large Language Models.

	### What It Does
	- Summarize Text: Generate concise summaries of survey responses or text data
	- Summarize PDFs: Extract key information from PDF documents page-by-page
	- Focus Summaries: Guide the model to focus on specific aspects of your data

	### Beta Test - We Want Your Feedback!
	This app is currently in beta and free to use while CatLLM is under review for publication, made possible by Bashir Ahmed's generous fellowship support.

	- Found a bug? Have a feature request? Please open an issue on [GitHub](https://github.com/chrissoria/cat-llm)
	- Reach out directly: [chrissoria@berkeley.edu](mailto:chrissoria@berkeley.edu)

	### Links
	- PyPI: [pip install cat-llm](https://pypi.org/project/cat-llm/)
	- GitHub: [github.com/chrissoria/cat-llm](https://github.com/chrissoria/cat-llm)
	- Classifier App: [CatLLM Survey Classifier](https://huggingface.co/spaces/CatLLM/survey-classifier)

	### Citation
	If you use CatLLM in your research, please cite:
	```
	Soria, C. (2025). CatLLM: A Python package for LLM-based text classification. DOI: 10.5281/zenodo.15532316
	```
	"""
	with st.expander("About This App"):
	    st.markdown(about_markdown)

	# Main layout: inputs and controls on the left, results on the right.
	col_input, col_output = st.columns([1, 1])

	with col_input:
	    # Input type selector
	    input_type_choice = st.radio(
	        "Input Type",
	        options=["Survey Responses", "PDF Documents"],
	        horizontal=True,
	        key="input_type_radio"
	    )

	    # Defaults shared by both input modes; the selected branch overrides them.
	    input_data = None
	    input_type_selected = "text"
	    description = ""
	    original_filename = "data"
	    pdf_mode = "Image (visual documents)"

	    if input_type_choice == "Survey Responses":
	        input_type_selected = "text"

	        uploaded_file = st.file_uploader(
	            "Upload Data (CSV or Excel)",
	            type=['csv', 'xlsx', 'xls'],
	            key="survey_file"
	        )

	        if st.button("Try Example Dataset", key="example_btn"):
	            st.session_state.example_loaded = True

	        columns = []
	        df = None
	        if uploaded_file is not None:
	            try:
	                # Case-insensitive so "DATA.CSV" is not handed to the Excel reader.
	                if uploaded_file.name.lower().endswith('.csv'):
	                    df = pd.read_csv(uploaded_file)
	                else:
	                    df = pd.read_excel(uploaded_file)
	                columns = df.columns.tolist()
	                st.success(f"Loaded {len(df):,} rows")
	            except Exception as e:
	                st.error(f"Error loading file: {e}")
	        elif st.session_state.get('example_loaded'):
	            try:
	                df = pd.read_csv("example_data.csv")
	                columns = df.columns.tolist()
	                st.success(f"Loaded example dataset ({len(df)} rows)")
	            except Exception as e:
	                # Report the failure instead of silently leaving the form empty.
	                st.error(f"Error loading example dataset: {e}")

	        selected_column = st.selectbox(
	            "Column to Summarize",
	            options=columns if columns else ["Upload a file first"],
	            disabled=not columns,
	            key="survey_column"
	        )

	        description = selected_column if columns else ""
	        original_filename = uploaded_file.name if uploaded_file else "example_data.csv"

	        if df is not None and columns and selected_column in columns:
	            input_data = df[selected_column].tolist()

	    else:  # PDF Documents
	        input_type_selected = "pdf"

	        pdf_files = st.file_uploader(
	            "Upload PDF Document(s)",
	            type=['pdf'],
	            accept_multiple_files=True,
	            key="pdf_files"
	        )

	        pdf_description = st.text_input(
	            "Document Description",
	            placeholder="e.g., 'research papers', 'interview transcripts'",
	            help="Helps the LLM understand context",
	            key="pdf_desc"
	        )

	        pdf_mode = st.radio(
	            "Processing Mode",
	            options=["Image (visual documents)", "Text (text-heavy)", "Both (comprehensive)"],
	            key="pdf_mode"
	        )

	        if pdf_files:
	            # This script re-executes on every widget interaction, so temp copies
	            # are remembered per upload instead of being rewritten each time.
	            if 'pdf_temp_cache' not in st.session_state:
	                st.session_state.pdf_temp_cache = {}
	            pdf_temp_cache = st.session_state.pdf_temp_cache

	            input_data = []
	            pdf_name_map = {}  # Map temp paths to original filenames (without extension)
	            for f in pdf_files:
	                # NOTE(review): assumes UploadedFile.file_id is unique per upload;
	                # when the attribute is absent a fresh copy is written as before.
	                upload_id = getattr(f, 'file_id', None)
	                tmp_path = pdf_temp_cache.get(upload_id) if upload_id else None
	                if tmp_path is None or not os.path.exists(tmp_path):
	                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
	                        # getvalue() returns the full buffer independent of the
	                        # stream's current read position.
	                        tmp.write(f.getvalue())
	                    tmp_path = tmp.name
	                    if upload_id:
	                        pdf_temp_cache[upload_id] = tmp_path
	                input_data.append(tmp_path)
	                # Drop only a trailing ".pdf" (any case); str.replace would also
	                # remove ".pdf" from the middle of a name.
	                stem, ext = os.path.splitext(f.name)
	                pdf_name_map[tmp_path] = stem if ext.lower() == '.pdf' else f.name
	            st.session_state.pdf_name_map = pdf_name_map
	            description = pdf_description or "document"
	            original_filename = "pdf_files"
	            st.success(f"Uploaded {len(pdf_files)} PDF file(s)")

	    st.markdown("---")

	    # Summarization options
	    st.markdown("### Summarization Options")

	    focus = st.text_input(
	        "Focus (optional)",
	        placeholder="e.g., 'main arguments', 'emotional content', 'key findings'",
	        help="Guide the model to focus on specific aspects"
	    )

	    max_length = st.number_input(
	        "Maximum Summary Length (words, optional)",
	        min_value=0,
	        max_value=1000,
	        value=0,
	        help="Leave at 0 for no limit"
	    )
	    max_length = max_length if max_length > 0 else None

	    instructions = st.text_input(
	        "Additional Instructions (optional)",
	        placeholder="e.g., 'use bullet points', 'include quotes'",
	        help="Custom instructions for the summarization"
	    )

	    st.markdown("---")

	    # Model selection
	    st.markdown("### Model Selection")
	    model_tier = st.radio(
	        "Model Tier",
	        options=["Free Models", "Bring Your Own Key"],
	        key="model_tier"
	    )

	    if model_tier == "Free Models":
	        model_display = st.selectbox("Model", options=FREE_MODEL_DISPLAY_NAMES, key="model")
	        model = FREE_MODELS_MAP[model_display]
	        api_key = ""
	    else:
	        model = st.selectbox("Model", options=PAID_MODEL_CHOICES, key="model_paid")
	        api_key = st.text_input("API Key", type="password", key="api_key")

	    # Summarize button
	    if st.button("Summarize Data", type="primary", use_container_width=True):
	        actual_api_key, provider = get_api_key(model, model_tier, api_key)

	        if not input_data:
	            st.error("Please upload data first")
	        elif not CATLLM_AVAILABLE:
	            # Without this check the failure surfaced as a NameError message.
	            st.error("The catllm package could not be imported, so summarization is unavailable")
	        elif not actual_api_key:
	            st.error(f"{provider} API key not configured")
	        else:
	            mode = None
	            if input_type_selected == "pdf":
	                mode_mapping = {
	                    "Image (visual documents)": "image",
	                    "Text (text-heavy)": "text",
	                    "Both (comprehensive)": "both"
	                }
	                mode = mode_mapping.get(pdf_mode, "image")

	            model_source = get_model_source(model)
	            items_list = input_data if isinstance(input_data, list) else [input_data]

	            # Rough up-front estimate: 5s per PDF page, 2s per text item (10s floor).
	            num_items = len(items_list)
	            if input_type_selected == "pdf":
	                total_pages = sum(count_pdf_pages(p) for p in items_list)
	                est_seconds = total_pages * 5
	            else:
	                est_seconds = max(10, num_items * 2)

	            est_time_str = f"{est_seconds:.0f}s" if est_seconds < 60 else f"{est_seconds/60:.1f}m"

	            # Progress UI
	            progress_bar = st.progress(0)
	            status_text = st.empty()
	            status_text.text(f"Starting... estimated time: ~{est_time_str}")
	            start_time = time.time()

	            def progress_callback(current_idx, total, label=None):
	                """Update the progress bar and status line for one processed item."""
	                # NOTE(review): assumes catllm reports a zero-based item index;
	                # values are clamped so an index equal to `total` stays in range.
	                fraction = current_idx / total if total > 0 else 0
	                fraction = min(max(fraction, 0.0), 1.0)
	                progress_bar.progress(fraction)

	                elapsed = time.time() - start_time
	                if current_idx > 0:
	                    avg_time = elapsed / current_idx
	                    eta_seconds = max(avg_time * (total - current_idx), 0)
	                    eta_str = f" | ETA: {eta_seconds:.0f}s" if eta_seconds < 60 else f" | ETA: {eta_seconds/60:.1f}m"
	                else:
	                    eta_str = ""

	                label_str = f" ({label})" if label else ""
	                item_number = min(current_idx + 1, total)
	                status_text.text(f"Processing item {item_number} of {total}{label_str} ({fraction*100:.0f}%){eta_str}")

	            # Stripped once so the API call, the report and the generated code agree.
	            focus_text = focus.strip() if focus else ""
	            instructions_text = instructions.strip() if instructions else ""

	            try:
	                # Build kwargs for summarize; optional settings are only sent when set.
	                summarize_kwargs = {
	                    "input_data": items_list,
	                    "api_key": actual_api_key,
	                    "description": description,
	                    "user_model": model,
	                    "model_source": model_source,
	                    "progress_callback": progress_callback,
	                }
	                if mode:
	                    summarize_kwargs["mode"] = mode
	                if focus_text:
	                    summarize_kwargs["focus"] = focus_text
	                if max_length:
	                    summarize_kwargs["max_length"] = max_length
	                if instructions_text:
	                    summarize_kwargs["instructions"] = instructions_text

	                result_df = catllm.summarize(**summarize_kwargs)

	                processing_time = time.time() - start_time
	                total_items = len(result_df)
	                progress_bar.progress(1.0)
	                status_text.text(f"Completed {total_items} items in {processing_time:.1f}s")

	                # Replace temp paths with original filenames for PDF input
	                if input_type_selected == "pdf" and 'pdf_path' in result_df.columns:
	                    pdf_name_map = st.session_state.get('pdf_name_map', {})

	                    def replace_temp_path(val):
	                        if pd.isna(val):
	                            return val
	                        val_str = str(val)
	                        for temp_path, orig_name in pdf_name_map.items():
	                            if temp_path in val_str:
	                                return val_str.replace(temp_path, orig_name + '.pdf')
	                        return val_str

	                    result_df['pdf_path'] = result_df['pdf_path'].apply(replace_temp_path)

	                # Save CSV: reserve the path, close the handle, then let pandas write it.
	                with tempfile.NamedTemporaryFile(mode='w', suffix='_summarized.csv', delete=False) as f:
	                    csv_path = f.name
	                result_df.to_csv(csv_path, index=False)

	                # Calculate success rate
	                if 'processing_status' not in result_df.columns:
	                    success_rate = 100.0
	                elif total_items == 0:
	                    success_rate = 0.0  # nothing processed; avoid dividing by zero
	                else:
	                    success_count = (result_df['processing_status'] == 'success').sum()
	                    success_rate = (success_count / total_items) * 100

	                # Get version info
	                catllm_version = getattr(catllm, '__version__', "unknown")
	                python_version = sys.version.split()[0]

	                # Generate methodology report
	                pdf_path = generate_methodology_report_pdf(
	                    model=model,
	                    column_name=description,
	                    num_rows=total_items,
	                    model_source=model_source,
	                    filename=original_filename,
	                    success_rate=success_rate,
	                    result_df=result_df,
	                    processing_time=processing_time,
	                    catllm_version=catllm_version,
	                    python_version=python_version,
	                    input_type=input_type_selected,
	                    description=description,
	                    focus=focus_text or None,
	                    max_length=max_length
	                )

	                # Generate code
	                code = generate_summarize_code(
	                    input_type_selected, description, model, model_source,
	                    focus=focus_text or None,
	                    max_length=max_length,
	                    instructions=instructions_text or None,
	                    mode=mode
	                )

	                st.session_state.results = {
	                    'df': result_df,
	                    'csv_path': csv_path,
	                    'pdf_path': pdf_path,
	                    'code': code,
	                    'status': f"Summarized {total_items} items in {processing_time:.1f}s",
	                }
	            except Exception as e:
	                st.error(f"Error: {str(e)}")
	            else:
	                # The rerun is triggered outside the try so its control-flow
	                # signal can never be mistaken for a summarization error.
	                st.success(f"Summarized {total_items} items in {processing_time:.1f}s")
	                st.rerun()

	with col_output:
	    st.markdown("### Results")

	    results = st.session_state.results
	    if results:
	        # The run stores its completion message under 'status'; show it here,
	        # since nothing else in this panel displayed it.
	        status_message = results.get('status')
	        if status_message:
	            st.success(status_message)

	        # Placeholder for future chart
	        st.info("Summary visualization coming soon!")

	        # Results dataframe, without the raw model-output columns when present.
	        cols_to_hide = ['model_response', 'json', 'raw_response', 'raw_json']
	        display_df = results['df'].drop(columns=cols_to_hide, errors='ignore')
	        st.dataframe(display_df, use_container_width=True)

	        # Downloads: (button label, temp file path, download name, MIME type).
	        downloads = (
	            ("Download Results (CSV)", results['csv_path'], "summarized_results.csv", "text/csv"),
	            ("Download Methodology Report (PDF)", results['pdf_path'], "methodology_report.pdf", "application/pdf"),
	        )
	        for dl_column, (label, path, file_name, mime) in zip(st.columns(2), downloads):
	            with dl_column:
	                try:
	                    with open(path, 'rb') as fh:
	                        payload = fh.read()
	                except OSError as exc:
	                    # A missing temp file should not take the whole page down.
	                    st.warning(f"{label} is unavailable: {exc}")
	                else:
	                    st.download_button(label, data=payload, file_name=file_name, mime=mime)

	        # Code
	        with st.expander("See the Code"):
	            st.code(results['code'], language='python')
	    else:
	        st.info("Upload data and click 'Summarize Data' to see results here.")

	# Bottom row: reset on the left, reproducibility-code toggle on the right.
	col_reset, col_code = st.columns(2)
	with col_reset:
	    reset_clicked = st.button("Reset", type="secondary", use_container_width=True)
	    if reset_clicked:
	        # Clear the stored results and the example-dataset flag, then redraw.
	        st.session_state.results = None
	        if 'example_loaded' in st.session_state:
	            del st.session_state['example_loaded']
	        st.rerun()

	with col_code:
	    # The toggle button is only rendered once results exist.
	    if st.session_state.results and st.button("See in Code", use_container_width=True):
	        st.session_state.show_code_modal = True

	# Reproducibility-code panel, shown while the flag is set and results exist.
	modal_requested = st.session_state.get('show_code_modal')
	if modal_requested and st.session_state.results:
	    panel_intro = (
	        "---",
	        "### Reproducibility Code",
	        "Use this code to reproduce the summarization with the CatLLM Python package:",
	    )
	    for markdown_block in panel_intro:
	        st.markdown(markdown_block)
	    st.code(st.session_state.results['code'], language='python')
	    close_clicked = st.button("Close")
	    if close_clicked:
	        st.session_state.show_code_modal = False
	        st.rerun()