# ============================================
# Imports
# ============================================
import re
import time
from pathlib import Path
from typing import List, Tuple, Optional

import gradio as gr
import pandas as pd
import yaml
from gpt4all import GPT4All
from huggingface_hub import hf_hub_download

# ============================================
# Configuration Loading
# ============================================
with open('_config.yaml', 'r', encoding='utf-8') as f:
    # safe_load returns None for an empty file; fall back to an empty mapping
    # so the .get() calls below use their defaults instead of crashing.
    config = yaml.safe_load(f) or {}

# Load defaults
default_config = config.get('defaults', {})
prompts_config = config.get('prompts', {})
title_config = config.get('title_generation', {})

# Get prompts
bnotes_prompt = prompts_config.get('bnotes', {}).get(
    'prompt',
    'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.'
)
title_prompt = title_config.get(
    'prompt',
    'The content between backticks is part of a book-chapter. write 8-11 words describing it.'
)

# Model selection
summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
title_model_alias = default_config.get('title', 'notes')

# ============================================
# Model Definitions
# ============================================
models_config = {
    'summary': {
        'repo_id': 'cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF',
        'filename': 'mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf',
        'local_dir': 'models',
        'template': {
            'prefix': '<|im_start|>user\n',
            'suffix': ' <|im_end|>\n<|im_start|>assistant\n',
            'stop_tokens': ['<|im_start|>', '<|im_end|>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,  # CPU only
            'num_predict': 4000
        }
    },
    'title': {
        'repo_id': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
        'filename': 'mistral-7b-instruct-v0.2.Q5_0.gguf',
        'local_dir': 'models',
        'template': {
            'prefix': '[INST] ',
            'suffix': ' [/INST]',
            'stop_tokens': ['']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,
            'num_predict': 100  # Shorter for titles
        }
    }
}

# ============================================
# Model Initialization
# ============================================
print("Downloading and initializing models...")

# Download models
for model_type in ['summary', 'title']:
    cfg = models_config[model_type]
    print(f"Downloading {model_type} model...")
    hf_hub_download(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        local_dir=cfg['local_dir'],
        local_dir_use_symlinks=False
    )

# Initialize models
print("Initializing summary model...")
summary_model = GPT4All(
    model_name=models_config['summary']['filename'],
    model_path=models_config['summary']['local_dir'],
    allow_download=False,
    device="cpu"
)

print("Initializing title model...")
title_model = GPT4All(
    model_name=models_config['title']['filename'],
    model_path=models_config['title']['local_dir'],
    allow_download=False,
    device="cpu"
)

# Configure models
# Summary model uses custom template from modelfile
summary_model.config["promptTemplate"] = (
    models_config['summary']['template']['prefix']
    + "{0}"
    + models_config['summary']['template']['suffix']
)
summary_model.config["systemPrompt"] = ""

# Title model uses Mistral instruct format
title_model.config["promptTemplate"] = (
    models_config['title']['template']['prefix']
    + "{0}"
    + models_config['title']['template']['suffix']
)
title_model.config["systemPrompt"] = ""

print("Models initialized successfully!")

# ============================================
# Text Processing Functions
# ============================================

# Matches a role marker only when it is the very first thing in the model
# output, so a first line that merely mentions the word "assistant" is kept.
_ASSISTANT_PREFIX_RE = re.compile(r'^(?:<\|im_start\|>\s*)?assistant\b\s*')


def sanitize_text(text: str) -> str:
    """Clean text for processing by stripping surrounding whitespace."""
    return text.strip()


def bold_text_before_colon(text: str) -> str:
    """Bold the label before the first colon of each bullet line.

    Only lines of the form ``- Label: ...`` whose label starts with an ASCII
    letter are rewritten, so labels that already start with ``**`` are left
    untouched.
    """
    pattern = r'^([ \t]*-[ \t]*)([a-zA-Z].*?):'
    replacement = r'\1**\2:**'
    return re.sub(pattern, replacement, text, flags=re.MULTILINE)


def generate_title(text: str, temperature: float = 0.3) -> str:
    """Generate a short title for ``text`` using the title model.

    Only the first 500 characters of ``text`` are sent to the model. The
    result is whitespace-normalized and capped at 150 characters.
    """
    prompt = f"```{text[:500]}```\n\n{title_prompt}"

    # Use title model with Mistral instruct format.
    # NOTE(review): the prompt is wrapped manually here while
    # config["promptTemplate"] is also set above - confirm the installed
    # gpt4all version does not apply the template a second time.
    full_prompt = (
        models_config['title']['template']['prefix']
        + prompt
        + models_config['title']['template']['suffix']
    )

    outputs = []
    for token in title_model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=100,
        streaming=True
    ):
        outputs.append(token)

    title = "".join(outputs).strip()

    # Clean up the title (remove any remaining tags or unwanted characters)
    title = re.sub(r'^.*?\[/INST\]\s*', '', title)  # Remove [/INST] and anything before it
    title = re.sub(r'\s+', ' ', title)  # Normalize whitespace

    return title[:150]  # Limit to 150 chars


def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
    """Generate a bulleted-notes summary of ``text`` using the summary model.

    The raw model output has a leading ``assistant`` role marker removed (if
    present) and bullet labels bolded via ``bold_text_before_colon``.
    """
    prompt = f"```{text}```\n\n{bnotes_prompt}"

    # Use custom template from modelfile
    full_prompt = (
        models_config['summary']['template']['prefix']
        + prompt
        + models_config['summary']['template']['suffix']
    )

    outputs = []
    for token in summary_model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=max_tokens,
        streaming=True
    ):
        outputs.append(token)

    summary = "".join(outputs).strip()

    # Clean up the response: strip only a leading role marker. The previous
    # pattern (r'^.*?assistant\s*') also deleted real content whenever the
    # first line happened to contain the word "assistant".
    summary = _ASSISTANT_PREFIX_RE.sub('', summary)
    summary = bold_text_before_colon(summary)

    return summary


# ============================================
# Processing Functions
# ============================================
def _cell_to_str(value) -> str:
    """Return a CSV cell as a string, mapping missing values (NaN/None) to ''."""
    if pd.isna(value):
        return ''
    return str(value)


def process_csv(
    file_obj,
    use_existing_titles: bool = True,
    generate_missing_titles: bool = True,
    temperature: float = 0.5,
    title_temperature: float = 0.3
):
    """Process a CSV file with a 'text' column and an optional 'title' column.

    This is a generator. It yields ``(DataFrame | None, message)`` tuples:

    * ``(None, error_message)`` once, if the file cannot be read or lacks a
      'text' column; the generator then stops;
    * ``(partial_results, progress_message)`` after every processed row;
    * ``(final_results, completion_message)`` as the last item.

    Args:
        file_obj: Uploaded file object; its ``.name`` attribute is read as the
            CSV path.
        use_existing_titles: Use non-empty values from the 'title' column.
        generate_missing_titles: Generate a title with the title model when
            no existing title is used; otherwise fall back to ``Text_<n>``.
        temperature: Sampling temperature for summaries.
        title_temperature: Sampling temperature for titles.
    """
    # Read CSV. Errors must be *yielded*: a value passed to `return` inside a
    # generator is never seen by a caller that simply iterates over it.
    try:
        df = pd.read_csv(file_obj.name)
    except Exception as e:
        yield None, f"Error reading CSV: {str(e)}"
        return

    # Check required columns
    if 'text' not in df.columns:
        yield None, "CSV must contain 'text' column"
        return

    read_titles = use_existing_titles and 'title' in df.columns
    total_rows = len(df)
    output_rows = []

    # Use the row position (not the index label) for progress and fallback
    # titles so the numbering is always a 1-based integer.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Missing cells arrive as NaN; str(NaN) would be the truthy 'nan'.
        text = _cell_to_str(row['text'])

        # Skip empty text
        if not text.strip():
            continue

        original_title = _cell_to_str(row['title']) if read_titles else ''
        if not original_title.strip():
            original_title = ''

        # Generate or use title
        start_time = time.time()

        if original_title:
            title = original_title
            title_generated = False
        elif generate_missing_titles:
            title = generate_title(text, temperature=title_temperature)
            title_generated = True
        else:
            title = f"Text_{position}"
            title_generated = False

        # Generate summary
        summary = generate_summary(text, temperature=temperature)
        elapsed_time = time.time() - start_time

        # Prepare output row
        output_row = {
            'title': title,
            'text': text,
            'text.len': len(text),
            'output': summary,
            'output.len': len(summary),
            'time': elapsed_time
        }

        # Add original title if it exists
        if original_title:
            output_row['original_title'] = original_title
        output_row['title_generated'] = title_generated

        output_rows.append(output_row)

        # Yield intermediate progress
        yield pd.DataFrame(output_rows), f"Processed {position}/{total_rows} rows..."

    # Final result
    output_df = pd.DataFrame(output_rows)
    yield output_df, f"Processing complete! Processed {len(output_df)} rows."


def _truncate_for_display(value, limit: int = 200):
    """Shorten long cell values to ``limit`` characters plus an ellipsis."""
    as_text = str(value)
    if len(as_text) > limit:
        # Slice the string form so long non-string cells cannot raise.
        return as_text[:limit] + '...'
    return value


def _format_seconds(value):
    """Format a numeric duration as e.g. '1.23s'; leave other values as-is."""
    try:
        return f"{value:.2f}s"
    except (TypeError, ValueError):
        return value


def format_for_display(df):
    """Return a display-friendly copy of ``df``.

    Long 'text'/'output' cells are truncated, 'time' is rendered as seconds,
    and the summary columns are moved to the front. ``None`` or an empty
    frame yields an empty DataFrame.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    display_df = df.copy()

    # Truncate long columns for display
    for column in ('text', 'output'):
        if column in display_df.columns:
            display_df[column] = display_df[column].apply(_truncate_for_display)

    # Format time column
    if 'time' in display_df.columns:
        display_df['time'] = display_df['time'].apply(_format_seconds)

    # Reorder columns for display
    display_order = ['title', 'text.len', 'output.len', 'time']
    display_order = [col for col in display_order if col in display_df.columns]

    # Add remaining columns
    other_cols = [col for col in display_df.columns if col not in display_order]
    display_order.extend(other_cols)

    return display_df[display_order]


# ============================================
# Gradio Interface
# ============================================
title = "Mistral-7B Text Summarizer with Title Generation"
description = """
Process CSV files with text content and generate:
1. Titles (using Mistral-7B-Instruct-v0.2)
2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)

CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
"""

with gr.Blocks(title=title, css="""
.output-table { max-height: 500px; overflow-y: auto; }
.progress-text { color: #666; font-style: italic; }
""") as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("## Input Settings")

            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                type="file"
            )

            use_existing_titles = gr.Checkbox(
                label="Use existing titles from CSV",
                value=True,
                info="If unchecked, will generate titles for all rows"
            )

            generate_missing_titles = gr.Checkbox(
                label="Generate titles for missing rows",
                value=True,
                info="Generate titles only when 'title' column is empty"
            )

            temperature = gr.Slider(
                label="Summary Temperature",
                value=0.5,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Higher values = more creative, lower = more deterministic"
            )

            title_temperature = gr.Slider(
                label="Title Temperature",
                value=0.3,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Temperature for title generation"
            )

            process_btn = gr.Button("Process CSV", variant="primary")

        with gr.Column(scale=2):
            # Output Section
            gr.Markdown("## Results")

            progress_text = gr.Textbox(
                label="Progress",
                value="Ready to process...",
                interactive=False
            )

            display_df = gr.Dataframe(
                label="Preview",
                headers=[],
                datatype=["str", "str", "number", "number", "str"],
                row_count=5,
                col_count=(5, "fixed"),
                wrap=True,
                elem_classes=["output-table"]
            )

            download_csv = gr.File(label="Download Full Results")

    # Event handlers
    def update_preview(df, message):
        """Update the preview display."""
        display_df = format_for_display(df)
        return display_df, message

    def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp):
        """Process the CSV and yield (preview, message, download_path) updates."""
        if file_obj is None:
            yield None, "Please upload a CSV file", None
            # Stop here; without this the handler fell through and tried to
            # process a missing file.
            return

        results_df = None
        last_message = "Processing complete!"
        for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp):
            if df_chunk is not None:
                results_df = df_chunk
            last_message = progress_msg
            # Error tuples carry df_chunk=None; format_for_display maps that
            # to an empty preview while the message is still shown.
            yield format_for_display(df_chunk), progress_msg, None

        if results_df is not None and len(results_df) > 0:
            # Save to file for download
            output_path = "processed_output.csv"
            results_df.to_csv(output_path, index=False)
            yield format_for_display(results_df), last_message, output_path

    # Connect events
    process_btn.click(
        fn=process_and_update,
        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
        outputs=[display_df, progress_text, download_csv]
    )

    # Update preview when file is uploaded
    def on_file_upload(file_obj):
        """Show the first rows and the column list of a freshly uploaded CSV."""
        if file_obj is None:
            return pd.DataFrame(), "No file uploaded"

        try:
            df = pd.read_csv(file_obj.name)
            preview_df = format_for_display(df.head(5))
            # Column labels are not guaranteed to be strings.
            info = f"Loaded {len(df)} rows. Columns: {', '.join(map(str, df.columns.tolist()))}"
            return preview_df, info
        except Exception as e:
            return pd.DataFrame(), f"Error loading file: {str(e)}"

    file_input.change(
        fn=on_file_upload,
        inputs=[file_input],
        outputs=[display_df, progress_text]
    )

# ============================================
# Launch Application
# ============================================
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )