# cognitivetech's picture
# Create app.py
# d09c502 verified
# ============================================
# Imports
# ============================================
import gradio as gr
import pandas as pd
import time
from pathlib import Path
import yaml
from typing import List, Tuple, Optional
import re
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download
# ============================================
# Configuration Loading
# ============================================
# Read the YAML configuration that sits next to the app.
# `yaml.safe_load` returns None for an empty document, so fall back to an
# empty mapping to keep the `.get(...)` lookups below working.
with open('_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f) or {}

# Load defaults. A section that is present but left blank in the YAML
# (e.g. a bare `defaults:` key) parses as None, hence the `or {}` guards.
default_config = config.get('defaults') or {}
prompts_config = config.get('prompts') or {}
title_config = config.get('title_generation') or {}

# Get prompts, falling back to the built-in wording when the config omits them
bnotes_prompt = (prompts_config.get('bnotes') or {}).get('prompt', 'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.')
title_prompt = title_config.get('prompt', 'The content between backticks is part of a book-chapter. write 8-11 words describing it.')

# Model selection
# NOTE(review): these aliases are not referenced anywhere else in the visible
# part of this file -- confirm whether they are still needed.
summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
title_model_alias = default_config.get('title', 'notes')
# ============================================
# Model Definitions
# ============================================
# Per-model settings, keyed by role ("summary" / "title"):
#   repo_id / filename / local_dir -- where the GGUF file is fetched from and stored
#   template                       -- prompt prefix/suffix wrapped around each request
#   params                         -- generation settings
# NOTE(review): `stop_tokens` and `params` are not read by any code in the
# visible part of this file -- confirm whether they are consumed elsewhere.
models_config = {
    "summary": {
        "repo_id": "cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF",
        "filename": "mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf",
        "local_dir": "models",
        "template": {
            "prefix": "<|im_start|>user\n",
            "suffix": " <|im_end|>\n<|im_start|>assistant\n",
            "stop_tokens": ["<|im_start|>", "<|im_end|>"],
        },
        "params": {
            "num_ctx": 8000,
            "num_gpu": -1,  # CPU only
            "num_predict": 4000,
        },
    },
    "title": {
        "repo_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        "filename": "mistral-7b-instruct-v0.2.Q5_0.gguf",
        "local_dir": "models",
        "template": {
            "prefix": "<s>[INST] ",
            "suffix": " [/INST]",
            "stop_tokens": ["</s>"],
        },
        "params": {
            "num_ctx": 8000,
            "num_gpu": -1,
            "num_predict": 100,  # Shorter for titles
        },
    },
}
# ============================================
# Model Initialization
# ============================================
print("Downloading and initializing models...")


def _fetch_weights(cfg):
    """Download one model file from the Hugging Face Hub into its `local_dir`."""
    hf_hub_download(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        local_dir=cfg['local_dir'],
        local_dir_use_symlinks=False,
    )


def _open_model(cfg):
    """Open an already-downloaded model file with GPT4All on the CPU device."""
    return GPT4All(
        model_name=cfg['filename'],
        model_path=cfg['local_dir'],
        allow_download=False,
        device="cpu",
    )


def _prompt_template(cfg):
    """Build the `prefix{0}suffix` template string from a model's template settings."""
    wrapper = cfg['template']
    return wrapper['prefix'] + "{0}" + wrapper['suffix']


# Download models (both files are fetched before either model is opened)
for model_type in ('summary', 'title'):
    cfg = models_config[model_type]
    print(f"Downloading {model_type} model...")
    _fetch_weights(cfg)

# Initialize models
print("Initializing summary model...")
summary_model = _open_model(models_config['summary'])
print("Initializing title model...")
title_model = _open_model(models_config['title'])

# Configure models
# Summary model uses custom template from modelfile
summary_model.config["promptTemplate"] = _prompt_template(models_config['summary'])
summary_model.config["systemPrompt"] = ""
# Title model uses Mistral instruct format
title_model.config["promptTemplate"] = _prompt_template(models_config['title'])
title_model.config["systemPrompt"] = ""
print("Models initialized successfully!")
# ============================================
# Text Processing Functions
# ============================================
def sanitize_text(text: str) -> str:
    """Return *text* with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed
def bold_text_before_colon(text: str) -> str:
    """Wrap the label of each `- label: ...` bullet line in Markdown bold.

    A line qualifies when it starts with optional spaces/tabs, a `-`, optional
    spaces/tabs and then a letter. The text from that letter up to and
    including the first colon on the line is wrapped in `**`. Bullets whose
    text starts with a non-letter (for example an existing `**`) and lines
    that are not bullets are returned unchanged.
    """
    bullet_label = re.compile(r'^([ \t]*-[ \t]*)([a-zA-Z].*?):', re.MULTILINE)

    def _embolden(match):
        bullet, label = match.groups()
        return f"{bullet}**{label}:**"

    return bullet_label.sub(_embolden, text)
def generate_title(text: str, temperature: float = 0.3) -> str:
    """Ask the title model for a short description of *text*.

    Only the first 500 characters of *text* are sent, wrapped in backticks and
    followed by the configured title prompt. The reply is cleaned up and cut
    to at most 150 characters.
    """
    template = models_config['title']['template']
    excerpt = text[:500]
    prompt = f"```{excerpt}```\n\n{title_prompt}"
    # Use title model with Mistral instruct format
    full_prompt = template['prefix'] + prompt + template['suffix']

    token_stream = title_model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=100,
        streaming=True,
    )
    title = "".join(token_stream).strip()

    # Drop an echoed "[/INST]" marker (and whatever precedes it on the first
    # line), then collapse every run of whitespace into a single space.
    title = re.sub(r'^.*?\[/INST\]\s*', '', title)
    title = re.sub(r'\s+', ' ', title)
    return title[:150]  # Limit to 150 chars
def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
    """Generate a bulleted-notes summary of *text* with the summary model.

    Args:
        text: Source text; sent in full, wrapped in backticks and followed by
            the configured bulleted-notes prompt.
        temperature: Sampling temperature passed to the model.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The model reply with surrounding whitespace removed, a leading
        "assistant" role label stripped if present, and bullet labels bolded
        via `bold_text_before_colon`.
    """
    template = models_config['summary']['template']
    prompt = f"```{text}```\n\n{bnotes_prompt}"
    # Use custom template from modelfile
    full_prompt = template['prefix'] + prompt + template['suffix']

    summary = "".join(
        summary_model.generate(
            prompt=full_prompt,
            temp=temperature,
            top_k=40,
            top_p=0.95,
            max_tokens=max_tokens,
            streaming=True,
        )
    ).strip()

    # Remove an echoed role label only when it is the very first thing in the
    # reply. The pattern is anchored on purpose: an unanchored
    # "anything up to 'assistant'" match would also delete real summary text
    # whenever the first line merely mentions the word "assistant".
    summary = re.sub(r'^(?:<\|im_start\|>\s*)?assistant\b[:\s]*', '', summary, count=1)
    summary = bold_text_before_colon(summary)
    return summary
# ============================================
# Processing Functions
# ============================================
def _cell_text(value) -> str:
    """Return a CSV cell as a string, mapping missing values (None/NaN) to ''."""
    if value is None:
        return ''
    if not isinstance(value, str) and pd.isna(value):
        return ''
    return str(value)


def process_csv(
    file_obj,
    use_existing_titles: bool = True,
    generate_missing_titles: bool = True,
    temperature: float = 0.5,
    title_temperature: float = 0.3
):
    """Process a CSV file with a 'text' column and an optional 'title' column.

    This is a generator. Every item it yields is a `(DataFrame or None,
    status message)` pair:

    * `(None, message)` when no file was given, the CSV cannot be read, or
      it has no 'text' column (the generator then stops);
    * `(rows so far, "Processed i/N rows...")` after each summarised row;
    * `(all rows, "Processing complete! ...")` once at the end.

    Args:
        file_obj: A path (`str` or `Path`) or an object exposing the path as
            `.name`, such as an uploaded temp file.
        use_existing_titles: Reuse non-empty values of the 'title' column.
        generate_missing_titles: Generate a title when none is reused;
            otherwise fall back to `Text_<row number>`.
        temperature: Sampling temperature for summaries.
        title_temperature: Sampling temperature for titles.

    Returns:
        The final `(DataFrame, message)` pair is also the generator's return
        value, so callers using `yield from` keep receiving it.
    """
    # Errors must be *yielded*: a `return value` inside a generator is
    # invisible to callers that iterate with a plain `for` loop.
    if file_obj is None:
        yield None, "Please upload a CSV file"
        return

    # Read CSV
    try:
        # `Path` has a `.name` too (the basename), so paths are checked first.
        csv_path = file_obj if isinstance(file_obj, (str, Path)) else file_obj.name
        df = pd.read_csv(csv_path)
    except Exception as e:
        yield None, f"Error reading CSV: {str(e)}"
        return

    # Check required columns
    if 'text' not in df.columns:
        yield None, "CSV must contain 'text' column"
        return

    reuse_titles = use_existing_titles and 'title' in df.columns
    total_rows = len(df)
    output_rows = []

    # Process each row
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Empty cells arrive as NaN; str(NaN) would be the truthy text 'nan'.
        text = _cell_text(row['text'])
        original_title = _cell_text(row['title']) if reuse_titles else ''

        # Skip empty text
        if not text.strip():
            continue

        has_title = bool(original_title.strip())

        # Generate or use title (the timer covers title + summary generation)
        start_time = time.time()
        if has_title:
            title = original_title
            title_generated = False
        elif generate_missing_titles:
            title = generate_title(text, temperature=title_temperature)
            title_generated = True
        else:
            title = f"Text_{position}"
            title_generated = False

        # Generate summary
        summary = generate_summary(text, temperature=temperature)
        elapsed_time = time.time() - start_time

        # Prepare output row
        output_row = {
            'title': title,
            'text': text,
            'text.len': len(text),
            'output': summary,
            'output.len': len(summary),
            'time': elapsed_time
        }
        # Add original title if it exists
        if has_title:
            output_row['original_title'] = original_title
        output_row['title_generated'] = title_generated
        output_rows.append(output_row)

        # Yield intermediate progress
        yield pd.DataFrame(output_rows), f"Processed {position}/{total_rows} rows..."

    # Create final DataFrame
    output_df = pd.DataFrame(output_rows)
    message = f"Processing complete! Processed {len(output_df)} rows."
    yield output_df, message
    return output_df, message
def format_for_display(df):
    """Return a copy of *df* prepared for the preview table.

    * `None` or a frame with no rows yields an empty DataFrame.
    * 'text' and 'output' cells longer than 200 characters are cut to 200
      characters plus '...'; shorter cells are left untouched.
    * Numeric 'time' cells are rendered as e.g. '1.23s'; cells that cannot be
      formatted as a number are left as they are.
    * Columns are reordered to title, text.len, output.len, time (those that
      exist), followed by every remaining column in its original order.

    The input frame is not modified.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    def _truncate(value, limit=200):
        # Measure and slice the same string form, so a long non-string cell
        # is shortened instead of raising TypeError on `value[:limit]`.
        as_text = value if isinstance(value, str) else str(value)
        if len(as_text) > limit:
            return as_text[:limit] + '...'
        return value

    def _format_seconds(value):
        # A previewed upload may already carry a textual 'time' column;
        # leave such cells alone rather than failing the whole preview.
        try:
            return f"{value:.2f}s"
        except (TypeError, ValueError):
            return value

    display_df = df.copy()

    # Truncate long columns for display
    for column in ('text', 'output'):
        if column in display_df.columns:
            display_df[column] = display_df[column].apply(_truncate)

    # Format time column
    if 'time' in display_df.columns:
        display_df['time'] = display_df['time'].apply(_format_seconds)

    # Reorder columns for display: preferred ones first, then the rest
    preferred = ['title', 'text.len', 'output.len', 'time']
    display_order = [col for col in preferred if col in display_df.columns]
    display_order.extend(col for col in display_df.columns if col not in display_order)
    return display_df[display_order]
# ============================================
# Gradio Interface
# ============================================
# Page title and introductory Markdown shown at the top of the UI.
title = "Mistral-7B Text Summarizer with Title Generation"
description = """
Process CSV files with text content and generate:
1. Titles (using Mistral-7B-Instruct-v0.2)
2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)
CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
"""

# Layout: a settings column on the left, results on the right, followed by
# the event handlers that connect the two.
with gr.Blocks(title=title, css="""
.output-table { max-height: 500px; overflow-y: auto; }
.progress-text { color: #666; font-style: italic; }
""") as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("## Input Settings")
            # NOTE(review): the handlers below read the upload's path from
            # `.name`; confirm `type="file"` is accepted by the installed
            # Gradio version.
            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                type="file"
            )
            use_existing_titles = gr.Checkbox(
                label="Use existing titles from CSV",
                value=True,
                info="If unchecked, will generate titles for all rows"
            )
            generate_missing_titles = gr.Checkbox(
                label="Generate titles for missing rows",
                value=True,
                info="Generate titles only when 'title' column is empty"
            )
            temperature = gr.Slider(
                label="Summary Temperature",
                value=0.5,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Higher values = more creative, lower = more deterministic"
            )
            title_temperature = gr.Slider(
                label="Title Temperature",
                value=0.3,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Temperature for title generation"
            )
            process_btn = gr.Button("Process CSV", variant="primary")
        with gr.Column(scale=2):
            # Output Section
            gr.Markdown("## Results")
            progress_text = gr.Textbox(
                label="Progress",
                value="Ready to process...",
                interactive=False
            )
            # NOTE(review): the table is declared with 5 fixed columns, while
            # processed rows carry more fields than that -- confirm the
            # preview renders as intended.
            display_df = gr.Dataframe(
                label="Preview",
                headers=[],
                datatype=["str", "str", "number", "number", "str"],
                row_count=5,
                col_count=(5, "fixed"),
                wrap=True,
                elem_classes=["output-table"]
            )
            download_csv = gr.File(label="Download Full Results")

    # Event handlers
    def update_preview(df, message):
        """Update the preview display."""
        display_df = format_for_display(df)
        return display_df, message

    def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp):
        """Process the uploaded CSV and yield incremental UI updates.

        Each yielded item is `(preview table, progress text, download file)`.
        The download slot stays None until all rows are done, at which point
        the full results are written to `processed_output.csv`.
        """
        if file_obj is None:
            yield None, "Please upload a CSV file", None
            # Stop here: without a file there is nothing to process.
            return

        results_df = None
        for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp):
            if df_chunk is not None:
                # Remember the latest frame; it is the one saved at the end.
                results_df = df_chunk
            yield format_for_display(df_chunk), progress_msg, None

        if results_df is not None:
            # Save to temporary file for download
            output_path = "processed_output.csv"
            results_df.to_csv(output_path, index=False)
            yield format_for_display(results_df), "Processing complete!", output_path

    # Connect events
    process_btn.click(
        fn=process_and_update,
        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
        outputs=[display_df, progress_text, download_csv]
    )

    # Update preview when file is uploaded
    def on_file_upload(file_obj):
        """Show the first 5 rows of the uploaded CSV plus its row count and column names."""
        if file_obj is None:
            return pd.DataFrame(), "No file uploaded"
        try:
            df = pd.read_csv(file_obj.name)
            preview_df = format_for_display(df.head(5))
            info = f"Loaded {len(df)} rows. Columns: {', '.join(df.columns.tolist())}"
            return preview_df, info
        except Exception as e:
            return pd.DataFrame(), f"Error loading file: {str(e)}"

    file_input.change(
        fn=on_file_upload,
        inputs=[file_input],
        outputs=[display_df, progress_text]
    )
# ============================================
# Launch Application
# ============================================
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    # "0.0.0.0" makes it listen on every network interface, on port 7860,
    # without a public share link and with debug output enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)