# Source: Hugging Face Spaces - cognitivetech/mistral-bulleted-notes (status: Running)
# File size: 14,338 bytes; revision d09c502

# ============================================
# Imports
# ============================================
import gradio as gr
import pandas as pd
import time
from pathlib import Path
import yaml
from typing import List, Tuple, Optional
import re
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download

# ============================================
# Configuration Loading
# ============================================
with open('_config.yaml', 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty document; normalise to a dict so the
    # .get() lookups below fall back to their defaults instead of raising.
    config = yaml.safe_load(f) or {}

# Load defaults
# A key written without a value (e.g. "defaults:") parses to None, so each
# section is normalised with "or {}" rather than relying on .get()'s default.
default_config = config.get('defaults') or {}
prompts_config = config.get('prompts') or {}
title_config = config.get('title_generation') or {}

# Get prompts (the literals are the fallbacks used when the file omits them)
bnotes_prompt = (prompts_config.get('bnotes') or {}).get('prompt', 'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.')
title_prompt = title_config.get('prompt', 'The content between backticks is part of a book-chapter. write 8-11 words describing it.')

# Model selection
summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
title_model_alias = default_config.get('title', 'notes')

# ============================================
# Model Definitions
# ============================================
# Per-model settings keyed by role ('summary' and 'title').
#   repo_id / filename / local_dir : passed to hf_hub_download when fetching
#                                    the GGUF file; filename and local_dir are
#                                    also passed to GPT4All when loading it.
#   template.prefix / suffix       : text placed before and after the user
#                                    prompt when building the model input.
#   template.stop_tokens           : stop strings associated with the template.
#   params                         : NOTE(review): no visible code reads these
#                                    values; generation limits are passed
#                                    directly to generate() instead - confirm
#                                    whether they are still needed.
models_config = {
    'summary': {
        'repo_id': 'cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF',
        'filename': 'mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf',
        'local_dir': 'models',
        'template': {
            'prefix': '<|im_start|>user\n',
            'suffix': ' <|im_end|>\n<|im_start|>assistant\n',
            'stop_tokens': ['<|im_start|>', '<|im_end|>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,  # CPU only
            'num_predict': 4000
        }
    },
    'title': {
        'repo_id': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
        'filename': 'mistral-7b-instruct-v0.2.Q5_0.gguf',
        'local_dir': 'models',
        'template': {
            'prefix': '<s>[INST] ',
            'suffix': ' [/INST]',
            'stop_tokens': ['</s>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,
            'num_predict': 100  # Shorter for titles
        }
    }
}

# ============================================
# Model Initialization
# ============================================
# Runs at import time: both model files are fetched and loaded before the UI
# is built, so importing this module blocks until that work has finished.
print("Downloading and initializing models...")

# Download models
# Each file is placed in the entry's 'local_dir' so that the GPT4All
# constructors below can load it from there with allow_download=False.
for model_type in ['summary', 'title']:
    cfg = models_config[model_type]
    print(f"Downloading {model_type} model...")
    hf_hub_download(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        local_dir=cfg['local_dir'],
        local_dir_use_symlinks=False
    )

# Initialize models
# Both models are loaded with device="cpu" and stay resident as module globals.
print("Initializing summary model...")
summary_model = GPT4All(
    model_name=models_config['summary']['filename'],
    model_path=models_config['summary']['local_dir'],
    allow_download=False,
    device="cpu"
)

print("Initializing title model...")
title_model = GPT4All(
    model_name=models_config['title']['filename'],
    model_path=models_config['title']['local_dir'],
    allow_download=False,
    device="cpu"
)

# Configure models
# NOTE(review): generate_title() and generate_summary() also wrap their prompts
# with the same prefix/suffix by hand; confirm whether generate() applies
# "promptTemplate" as well, in which case the template would be applied twice.
# Summary model uses custom template from modelfile
summary_model.config["promptTemplate"] = models_config['summary']['template']['prefix'] + "{0}" + models_config['summary']['template']['suffix']
summary_model.config["systemPrompt"] = ""

# Title model uses Mistral instruct format
title_model.config["promptTemplate"] = models_config['title']['template']['prefix'] + "{0}" + models_config['title']['template']['suffix']
title_model.config["systemPrompt"] = ""

print("Models initialized successfully!")

# ============================================
# Text Processing Functions
# ============================================
def sanitize_text(text: str) -> str:
    """Return ``text`` with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed

def bold_text_before_colon(text: str) -> str:
    """Wrap the label of each ``- label:`` bullet line in Markdown bold.

    A line qualifies when, after optional spaces/tabs, it starts with a dash
    followed by a label whose first character is an ASCII letter. The label
    runs up to the first colon on that line and is rewritten as
    ``**label:**``. Labels starting with any other character (for example an
    existing ``**``) are left unchanged.
    """
    bullet_label = re.compile(r'^([ \t]*-[ \t]*)([a-zA-Z].*?):', re.MULTILINE)
    bolded_form = r'\1**\2:**'
    return bullet_label.sub(bolded_form, text)

def generate_title(text: str, temperature: float = 0.3) -> str:
    """Ask the title model for a short description of ``text``.

    Only the first 500 characters of ``text`` are sent, wrapped in triple
    backticks and followed by the configured title prompt. The result is
    collapsed onto one line and capped at 150 characters.

    Args:
        text: Source passage to describe.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The cleaned title string (at most 150 characters).
    """
    title_template = models_config['title']['template']
    excerpt = text[:500]
    instruction = f"```{excerpt}```\n\n{title_prompt}"

    # The Mistral instruct markers are added by hand around the instruction.
    full_prompt = title_template['prefix'] + instruction + title_template['suffix']

    token_stream = title_model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=100,
        streaming=True
    )
    raw_title = "".join(token_stream).strip()

    # Drop an echoed "[/INST]" marker (and anything before it on that line),
    # then squeeze every whitespace run down to a single space.
    without_marker = re.sub(r'^.*?\[/INST\]\s*', '', raw_title)
    single_line = re.sub(r'\s+', ' ', without_marker)
    return single_line[:150]

def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
    """Generate bulleted notes for ``text`` with the summary model.

    The text is wrapped in triple backticks, followed by the configured notes
    prompt, and surrounded by the summary template's prefix and suffix.

    Args:
        text: Passage to summarise.
        temperature: Sampling temperature forwarded to the model.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The cleaned summary with bullet labels bolded.
    """
    template = models_config['summary']['template']
    prompt = f"```{text}```\n\n{bnotes_prompt}"

    # Use custom template from modelfile
    full_prompt = template['prefix'] + prompt + template['suffix']

    summary = "".join(
        summary_model.generate(
            prompt=full_prompt,
            temp=temperature,
            top_k=40,
            top_p=0.95,
            max_tokens=max_tokens,
            streaming=True
        )
    ).strip()

    # Remove an echoed role marker only when it is the very first thing in the
    # output. Matching "assistant" anywhere on the first line would delete real
    # content such as a heading that merely mentions the word.
    summary = re.sub(r'^(?:<\|im_start\|>\s*)?assistant\b\s*', '', summary)

    # Apply the template's stop strings: anything from the first one onwards is
    # template residue or a runaway continuation, not part of the notes.
    cut_at = len(summary)
    for stop_token in template.get('stop_tokens', []):
        if not stop_token:
            continue  # an empty string would match at position 0
        position = summary.find(stop_token)
        if position != -1:
            cut_at = min(cut_at, position)
    summary = summary[:cut_at].rstrip()

    return bold_text_before_colon(summary)

# ============================================
# Processing Functions
# ============================================
def _cell_text(value) -> str:
    """Return a CSV cell as text, mapping missing values (NaN/None) to ''."""
    return '' if pd.isna(value) else str(value)


def process_csv(
    file_obj,
    use_existing_titles: bool = True,
    generate_missing_titles: bool = True,
    temperature: float = 0.5,
    title_temperature: float = 0.3
):
    """Summarise every non-empty 'text' row of a CSV, yielding progress.

    This is a generator. Every result, including errors and the final
    result, is delivered as a yielded ``(DataFrame | None, message)`` pair;
    a ``None`` frame means the message reports a failure.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the CSV.
        use_existing_titles: Reuse non-empty values of a 'title' column.
        generate_missing_titles: Generate a title when none is reused.
        temperature: Sampling temperature for summaries.
        title_temperature: Sampling temperature for generated titles.

    Yields:
        ``(rows_so_far, status_message)`` after each processed row, then one
        final pair once all rows are done.
    """
    # Read CSV
    try:
        df = pd.read_csv(file_obj.name)
    except Exception as e:
        # A generator's "return value" never reaches a for-loop consumer, so
        # the error has to be yielded to be seen.
        yield None, f"Error reading CSV: {str(e)}"
        return

    # Check required columns
    if 'text' not in df.columns:
        yield None, "CSV must contain 'text' column"
        return

    reuse_titles = use_existing_titles and 'title' in df.columns
    total_rows = len(df)
    output_rows = []

    # Process each row; position is the 1-based row number used in messages
    # and fallback titles regardless of the frame's index labels.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Empty cells are read as NaN; str(NaN) would be the text 'nan'.
        text = _cell_text(row['text'])

        # Skip empty text
        if not text.strip():
            continue

        original_title = ''
        if reuse_titles:
            candidate = _cell_text(row['title'])
            if candidate.strip():
                original_title = candidate

        # Generate or use title (timing covers title plus summary)
        start_time = time.time()

        if original_title:
            title = original_title
            title_generated = False
        elif generate_missing_titles:
            title = generate_title(text, temperature=title_temperature)
            title_generated = True
        else:
            title = f"Text_{position}"
            title_generated = False

        # Generate summary
        summary = generate_summary(text, temperature=temperature)
        elapsed_time = time.time() - start_time

        # Prepare output row
        output_row = {
            'title': title,
            'text': text,
            'text.len': len(text),
            'output': summary,
            'output.len': len(summary),
            'time': elapsed_time
        }

        # Add original title if it exists
        if original_title:
            output_row['original_title'] = original_title
            output_row['title_generated'] = title_generated

        output_rows.append(output_row)

        # Yield intermediate progress
        yield pd.DataFrame(output_rows), f"Processed {position}/{total_rows} rows..."

    # Final result, yielded for the same reason as the errors above.
    yield pd.DataFrame(output_rows), f"Processing complete! Processed {len(output_rows)} rows."

def _truncate_for_display(value, limit=200):
    """Shorten a cell whose text form is longer than ``limit`` characters."""
    as_text = str(value)
    if len(as_text) > limit:
        # Slice the text form so non-string cells cannot raise on slicing.
        return as_text[:limit] + '...'
    return value


def _format_seconds(value):
    """Render a numeric duration as e.g. '1.23s'; leave other values as-is."""
    try:
        return f"{value:.2f}s"
    except (TypeError, ValueError):
        # Uploaded CSVs may carry their own non-numeric 'time' column.
        return value


def format_for_display(df):
    """Return a copy of ``df`` prepared for the preview table.

    Long 'text' and 'output' cells are cut to 200 characters plus '...',
    numeric 'time' values are rendered as seconds with two decimals, and the
    columns are reordered so that title, lengths and time come first. The
    input frame is not modified.

    Args:
        df: Frame to format, or None.

    Returns:
        The formatted copy, or an empty DataFrame when ``df`` is None or has
        no rows.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    display_df = df.copy()

    # Truncate long columns for display
    for column in ('text', 'output'):
        if column in display_df.columns:
            display_df[column] = display_df[column].apply(_truncate_for_display)

    # Format time column
    if 'time' in display_df.columns:
        display_df['time'] = display_df['time'].apply(_format_seconds)

    # Preferred columns first, then everything else in its existing order
    leading = [col for col in ('title', 'text.len', 'output.len', 'time') if col in display_df.columns]
    trailing = [col for col in display_df.columns if col not in leading]

    return display_df[leading + trailing]

# ============================================
# Gradio Interface
# ============================================
# Page heading and introductory Markdown rendered at the top of the app.
title = "Mistral-7B Text Summarizer with Title Generation"
description = """
Process CSV files with text content and generate:
1. Titles (using Mistral-7B-Instruct-v0.2)
2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)

CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
"""

# Build the UI. The "output-table" CSS class is attached to the preview table
# below; "progress-text" is defined here but not attached to any component in
# this layout.
with gr.Blocks(title=title, css="""
    .output-table { max-height: 500px; overflow-y: auto; }
    .progress-text { color: #666; font-style: italic; }
""") as demo:

    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("## Input Settings")

            # The handlers read the upload's path from the received object's
            # ``.name`` attribute.
            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                type="file"
            )

            use_existing_titles = gr.Checkbox(
                label="Use existing titles from CSV",
                value=True,
                info="If unchecked, will generate titles for all rows"
            )

            generate_missing_titles = gr.Checkbox(
                label="Generate titles for missing rows",
                value=True,
                info="Generate titles only when 'title' column is empty"
            )

            temperature = gr.Slider(
                label="Summary Temperature",
                value=0.5,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Higher values = more creative, lower = more deterministic"
            )

            title_temperature = gr.Slider(
                label="Title Temperature",
                value=0.3,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Temperature for title generation"
            )

            process_btn = gr.Button("Process CSV", variant="primary")

        with gr.Column(scale=2):
            # Output Section
            gr.Markdown("## Results")

            # Shows status messages from both the upload and processing handlers.
            progress_text = gr.Textbox(
                label="Progress",
                value="Ready to process...",
                interactive=False
            )

            # NOTE(review): declared with five fixed columns, but the rows
            # built by process_csv carry at least six keys; confirm how the
            # component treats frames whose column count differs.
            display_df = gr.Dataframe(
                label="Preview",
                headers=[],
                datatype=["str", "str", "number", "number", "str"],
                row_count=5,
                col_count=(5, "fixed"),
                wrap=True,
                elem_classes=["output-table"]
            )

            # Receives the path of the CSV written once processing finishes.
            download_csv = gr.File(label="Download Full Results")
    
    # Event handlers
    def update_preview(df, message):
        """Return the display-formatted table together with ``message`` as given."""
        return format_for_display(df), message
    
    def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp):
        """Run CSV processing and stream UI updates.

        Each yielded value is ``(preview_table, status_message, download_path)``
        matching the three output components; ``download_path`` is only set on
        the final update, after the full results have been written to disk.

        Args:
            file_obj: Uploaded file object, or None when nothing was uploaded.
            use_titles: Reuse titles found in the CSV.
            gen_missing: Generate titles that are missing.
            temp: Sampling temperature for summaries.
            title_temp: Sampling temperature for titles.
        """
        if file_obj is None:
            yield None, "Please upload a CSV file", None
            # Stop here rather than passing None on to process_csv.
            return

        results_df = None
        status_reported = False
        for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp):
            if df_chunk is None:
                # A result without a table is a status report; show it
                # instead of silently dropping it.
                status_reported = True
                yield None, progress_msg, None
                continue
            results_df = df_chunk
            yield format_for_display(df_chunk), progress_msg, None

        if results_df is None:
            if not status_reported:
                # Nothing came back at all; tell the user rather than leaving
                # the previous status on screen.
                yield None, "No rows were processed. Check that the CSV is readable and has a non-empty 'text' column.", None
            return

        # Save the untruncated results for download
        output_path = "processed_output.csv"
        results_df.to_csv(output_path, index=False)
        yield format_for_display(results_df), "Processing complete!", output_path
    
    # Connect events
    # Clicking the button runs process_and_update; each tuple it yields is
    # mapped, in order, onto the preview table, the progress box and the
    # download slot.
    process_btn.click(
        fn=process_and_update,
        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
        outputs=[display_df, progress_text, download_csv]
    )
    
    # Update preview when file is uploaded
    def on_file_upload(file_obj):
        """Preview a newly selected CSV.

        Returns ``(preview_table, status_message)``: the first five rows in
        display format plus a row/column summary, or an empty table with an
        explanatory message when there is no file or it cannot be loaded.
        """
        if file_obj is None:
            return pd.DataFrame(), "No file uploaded"

        try:
            uploaded = pd.read_csv(file_obj.name)
            first_rows = format_for_display(uploaded.head(5))
            column_names = ', '.join(uploaded.columns.tolist())
            return first_rows, f"Loaded {len(uploaded)} rows. Columns: {column_names}"
        except Exception as exc:
            # Any failure is reported in the status box instead of raising.
            return pd.DataFrame(), f"Error loading file: {str(exc)}"
    
    # Whenever the selected file changes, refresh the preview table and the
    # status box with the values returned by on_file_upload.
    file_input.change(
        fn=on_file_upload,
        inputs=[file_input],
        outputs=[display_df, progress_text]
    )

# ============================================
# Launch Application
# ============================================
# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=7860,
        share=False,
        debug=True
    )