Spaces:

cognitivetech
/

mistral-bulleted-notes

Running

App Files Files Community

cognitivetech committed on Dec 19, 2025

Commit

d09c502

verified ·

1 Parent(s): f613100

Create app.py

Browse files

Files changed (1) hide show

app.py +424 -0

app.py ADDED Viewed

	@@ -0,0 +1,424 @@

+# ============================================
+# Imports
+# ============================================
+import gradio as gr
+import pandas as pd
+import time
+from pathlib import Path
+import yaml
+from typing import List, Tuple, Optional
+import re
+from gpt4all import GPT4All
+from huggingface_hub import hf_hub_download
+# ============================================
+# Configuration Loading
+# ============================================
+with open('_config.yaml', 'r') as f:
+    config = yaml.safe_load(f)
+# Load defaults
+default_config = config.get('defaults', {})
+prompts_config = config.get('prompts', {})
+title_config = config.get('title_generation', {})
+# Get prompts
+bnotes_prompt = prompts_config.get('bnotes', {}).get('prompt', 'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.')
+title_prompt = title_config.get('prompt', 'The content between backticks is part of a book-chapter. write 8-11 words describing it.')
+# Model selection
+summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
+title_model_alias = default_config.get('title', 'notes')
+# ============================================
+# Model Definitions
+# ============================================
+# Per-role model settings, keyed by 'summary' and 'title'.
+#   repo_id / filename / local_dir - passed to hf_hub_download and GPT4All below.
+#   template.prefix / .suffix      - wrapped around each prompt by the
+#                                    generate_* functions and used for promptTemplate.
+# NOTE(review): 'stop_tokens' and the 'params' entries are not read by any code
+# visible in this file; they look like values carried over from a modelfile.
+models_config = {
+    'summary': {
+        'repo_id': 'cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF',
+        'filename': 'mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf',
+        'local_dir': 'models',
+        'template': {
+            'prefix': '<|im_start|>user\n',
+            'suffix': ' <|im_end|>\n<|im_start|>assistant\n',
+            'stop_tokens': ['<|im_start|>', '<|im_end|>']
+        },
+        'params': {
+            'num_ctx': 8000,
+            'num_gpu': -1,  # CPU only
+            'num_predict': 4000
+        }
+    },
+    'title': {
+        'repo_id': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
+        'filename': 'mistral-7b-instruct-v0.2.Q5_0.gguf',
+        'local_dir': 'models',
+        'template': {
+            'prefix': '<s>[INST] ',
+            'suffix': ' [/INST]',
+            'stop_tokens': ['</s>']
+        },
+        'params': {
+            'num_ctx': 8000,
+            'num_gpu': -1,
+            'num_predict': 100  # Shorter for titles
+        }
+    }
+}
+# ============================================
+# Model Initialization
+# ============================================
+# Everything in this section runs at import time: both model files are
+# fetched into their 'local_dir' and then loaded before the UI is built.
+print("Downloading and initializing models...")
+# Download models
+for model_type in ['summary', 'title']:
+    cfg = models_config[model_type]
+    print(f"Downloading {model_type} model...")
+    hf_hub_download(
+        repo_id=cfg['repo_id'],
+        filename=cfg['filename'],
+        local_dir=cfg['local_dir'],
+        local_dir_use_symlinks=False
+    )
+# Initialize models
+# allow_download=False: the files were already fetched by hf_hub_download
+# above, so GPT4All only has to load them from 'local_dir'.
+print("Initializing summary model...")
+summary_model = GPT4All(
+    model_name=models_config['summary']['filename'],
+    model_path=models_config['summary']['local_dir'],
+    allow_download=False,
+    device="cpu"
+)
+print("Initializing title model...")
+title_model = GPT4All(
+    model_name=models_config['title']['filename'],
+    model_path=models_config['title']['local_dir'],
+    allow_download=False,
+    device="cpu"
+)
+# Configure models
+# NOTE(review): generate_title/generate_summary also wrap their prompts with
+# the same prefix/suffix by hand. Confirm that the installed GPT4All version
+# does not additionally apply promptTemplate, or prompts would be wrapped twice.
+# Summary model uses custom template from modelfile
+summary_model.config["promptTemplate"] = models_config['summary']['template']['prefix'] + "{0}" + models_config['summary']['template']['suffix']
+summary_model.config["systemPrompt"] = ""
+# Title model uses Mistral instruct format
+title_model.config["promptTemplate"] = models_config['title']['template']['prefix'] + "{0}" + models_config['title']['template']['suffix']
+title_model.config["systemPrompt"] = ""
+print("Models initialized successfully!")
+# ============================================
+# Text Processing Functions
+# ============================================
+def sanitize_text(text: str) -> str:
+    """Clean text for processing."""
+    return text.strip()
+def bold_text_before_colon(text: str) -> str:
+    """Bold any text before the first colon that isn't already bolded."""
+    pattern = r'^([ \t]*-[ \t]*)([a-zA-Z].*?):'
+    replacement = r'\1**\2:**'
+    return re.sub(pattern, replacement, text, flags=re.MULTILINE)
+def generate_title(text: str, temperature: float = 0.3) -> str:
+    """Generate a title for the given text."""
+    prompt = f"```{text[:500]}```\n\n{title_prompt}"
+    # Use title model with Mistral instruct format
+    full_prompt = models_config['title']['template']['prefix'] + prompt + models_config['title']['template']['suffix']
+    outputs = []
+    for token in title_model.generate(
+        prompt=full_prompt,
+        temp=temperature,
+        top_k=40,
+        top_p=0.95,
+        max_tokens=100,
+        streaming=True
+    ):
+        outputs.append(token)
+    title = "".join(outputs).strip()
+    # Clean up the title (remove any remaining tags or unwanted characters)
+    title = re.sub(r'^.*?\[/INST\]\s*', '', title)  # Remove [/INST] and anything before it
+    title = re.sub(r'\s+', ' ', title)  # Normalize whitespace
+    return title[:150]  # Limit to 150 chars
+def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
+    """Generate bulleted notes summary."""
+    prompt = f"```{text}```\n\n{bnotes_prompt}"
+    # Use custom template from modelfile
+    full_prompt = models_config['summary']['template']['prefix'] + prompt + models_config['summary']['template']['suffix']
+    outputs = []
+    for token in summary_model.generate(
+        prompt=full_prompt,
+        temp=temperature,
+        top_k=40,
+        top_p=0.95,
+        max_tokens=max_tokens,
+        streaming=True
+    ):
+        outputs.append(token)
+    summary = "".join(outputs).strip()
+    # Clean up the response
+    summary = re.sub(r'^.*?assistant\s*', '', summary)  # Remove "assistant" prefix
+    summary = bold_text_before_colon(summary)
+    return summary
+# ============================================
+# Processing Functions
+# ============================================
+def process_csv(
+    file_obj,
+    use_existing_titles: bool = True,
+    generate_missing_titles: bool = True,
+    temperature: float = 0.5,
+    title_temperature: float = 0.3
+):
+    """Process CSV file with title and text columns."""
+    # Read CSV
+    try:
+        df = pd.read_csv(file_obj.name)
+    except Exception as e:
+        return None, f"Error reading CSV: {str(e)}"
+    # Check required columns
+    if 'text' not in df.columns:
+        return None, "CSV must contain 'text' column"
+    # Prepare output DataFrame
+    output_rows = []
+    # Process each row
+    for idx, row in df.iterrows():
+        text = str(row.get('text', ''))
+        original_title = str(row.get('title', '')) if 'title' in df.columns and use_existing_titles else ''
+        # Skip empty text
+        if not text.strip():
+            continue
+        # Generate or use title
+        start_time = time.time()
+        if original_title and use_existing_titles:
+            title = original_title
+            title_generated = False
+        elif generate_missing_titles:
+            title = generate_title(text, temperature=title_temperature)
+            title_generated = True
+        else:
+            title = f"Text_{idx+1}"
+            title_generated = False
+        # Generate summary
+        summary = generate_summary(text, temperature=temperature)
+        end_time = time.time()
+        # Calculate metrics
+        elapsed_time = end_time - start_time
+        # Prepare output row
+        output_row = {
+            'title': title,
+            'text': text,
+            'text.len': len(text),
+            'output': summary,
+            'output.len': len(summary),
+            'time': elapsed_time
+        }
+        # Add original title if it exists
+        if original_title and use_existing_titles:
+            output_row['original_title'] = original_title
+            output_row['title_generated'] = title_generated
+        output_rows.append(output_row)
+        # Yield intermediate progress
+        yield pd.DataFrame(output_rows), f"Processed {idx+1}/{len(df)} rows..."
+    # Create final DataFrame
+    output_df = pd.DataFrame(output_rows)
+    return output_df, f"Processing complete! Processed {len(output_df)} rows."
+def format_for_display(df):
+    """Format DataFrame for nice display."""
+    if df is None or len(df) == 0:
+        return pd.DataFrame()
+    display_df = df.copy()
+    # Truncate long columns for display
+    if 'text' in display_df.columns:
+        display_df['text'] = display_df['text'].apply(lambda x: x[:200] + '...' if len(str(x)) > 200 else x)
+    if 'output' in display_df.columns:
+        display_df['output'] = display_df['output'].apply(lambda x: x[:200] + '...' if len(str(x)) > 200 else x)
+    # Format time column
+    if 'time' in display_df.columns:
+        display_df['time'] = display_df['time'].apply(lambda x: f"{x:.2f}s")
+    # Reorder columns for display
+    display_order = ['title', 'text.len', 'output.len', 'time']
+    display_order = [col for col in display_order if col in display_df.columns]
+    # Add remaining columns
+    other_cols = [col for col in display_df.columns if col not in display_order]
+    display_order.extend(other_cols)
+    return display_df[display_order]
+# ============================================
+# Gradio Interface
+# ============================================
+# Page heading and introductory text shown at the top of the app.
+title = "Mistral-7B Text Summarizer with Title Generation"
+description = """
+Process CSV files with text content and generate:
+1. Titles (using Mistral-7B-Instruct-v0.2)
+2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)
+CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
+"""
+# Two-column layout: input controls on the left, progress/preview/download
+# on the right. The CSS classes are attached through elem_classes below.
+with gr.Blocks(title=title, css="""
+    .output-table { max-height: 500px; overflow-y: auto; }
+    .progress-text { color: #666; font-style: italic; }
+""") as demo:
+    gr.Markdown(f"# {title}")
+    gr.Markdown(description)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input Section
+            gr.Markdown("## Input Settings")
+            # NOTE(review): the handlers read `file_obj.name`, so they rely on
+            # this component handing over a file object; confirm that
+            # type="file" is accepted by the installed Gradio version.
+            file_input = gr.File(
+                label="Upload CSV File",
+                file_types=[".csv"],
+                type="file"
+            )
+            # The two checkboxes map to process_csv's use_existing_titles and
+            # generate_missing_titles; with both off, rows are titled Text_<n>.
+            use_existing_titles = gr.Checkbox(
+                label="Use existing titles from CSV",
+                value=True,
+                info="If unchecked, will generate titles for all rows"
+            )
+            generate_missing_titles = gr.Checkbox(
+                label="Generate titles for missing rows",
+                value=True,
+                info="Generate titles only when 'title' column is empty"
+            )
+            temperature = gr.Slider(
+                label="Summary Temperature",
+                value=0.5,
+                minimum=0.0,
+                maximum=1.0,
+                step=0.05,
+                info="Higher values = more creative, lower = more deterministic"
+            )
+            title_temperature = gr.Slider(
+                label="Title Temperature",
+                value=0.3,
+                minimum=0.0,
+                maximum=1.0,
+                step=0.05,
+                info="Temperature for title generation"
+            )
+            process_btn = gr.Button("Process CSV", variant="primary")
+        with gr.Column(scale=2):
+            # Output Section
+            gr.Markdown("## Results")
+            progress_text = gr.Textbox(
+                label="Progress",
+                value="Ready to process...",
+                interactive=False
+            )
+            # NOTE(review): the table is declared with five fixed columns, but
+            # process_csv rows carry at least six keys; confirm how Gradio
+            # renders a DataFrame wider than col_count.
+            display_df = gr.Dataframe(
+                label="Preview",
+                headers=[],
+                datatype=["str", "str", "number", "number", "str"],
+                row_count=5,
+                col_count=(5, "fixed"),
+                wrap=True,
+                elem_classes=["output-table"]
+            )
+            download_csv = gr.File(label="Download Full Results")
+    # Event handlers
+    def update_preview(df, message):
+        """Update the preview display."""
+        display_df = format_for_display(df)
+        return display_df, message
+    def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp):
+        """Process CSV and yield incremental updates."""
+        if file_obj is None:
+            yield None, "Please upload a CSV file", None
+        results_df = None
+        for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp):
+            if df_chunk is not None:
+                results_df = df_chunk
+                yield format_for_display(df_chunk), progress_msg, None
+        if results_df is not None:
+            # Save to temporary file for download
+            output_path = "processed_output.csv"
+            results_df.to_csv(output_path, index=False)
+            yield format_for_display(results_df), "Processing complete!", output_path
+    # Connect events
+    # process_and_update is a generator, so each item it yields updates the
+    # preview table, the progress box and the download slot in that order.
+    process_btn.click(
+        fn=process_and_update,
+        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
+        outputs=[display_df, progress_text, download_csv]
+    )
+    # Update preview when file is uploaded
+    def on_file_upload(file_obj):
+        if file_obj is None:
+            return pd.DataFrame(), "No file uploaded"
+        try:
+            df = pd.read_csv(file_obj.name)
+            preview_df = format_for_display(df.head(5))
+            info = f"Loaded {len(df)} rows. Columns: {', '.join(df.columns.tolist())}"
+            return preview_df, info
+        except Exception as e:
+            return pd.DataFrame(), f"Error loading file: {str(e)}"
+    # Refresh the preview table and the progress box whenever the selected
+    # file changes (on_file_upload also handles the file being cleared).
+    file_input.change(
+        fn=on_file_upload,
+        inputs=[file_input],
+        outputs=[display_df, progress_text]
+    )
+# ============================================
+# Launch Application
+# ============================================
+# Start the server only when run as a script: bind to all interfaces on port
+# 7860, without a public share link, with Gradio debug mode switched on.
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )