|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import time |
|
|
from pathlib import Path |
|
|
import yaml |
|
|
from typing import List, Tuple, Optional |
|
|
import re |
|
|
from gpt4all import GPT4All |
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load app configuration. YAML is read as UTF-8 explicitly so the prompt
# strings decode identically on every platform (the open() previously used
# the locale-dependent default encoding).
with open('_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

default_config = config.get('defaults', {})
prompts_config = config.get('prompts', {})
title_config = config.get('title_generation', {})

# Prompt texts, with fallbacks for when the YAML omits them.
bnotes_prompt = prompts_config.get('bnotes', {}).get('prompt', 'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.')
title_prompt = title_config.get('prompt', 'The content between backticks is part of a book-chapter. write 8-11 words describing it.')

# Model aliases from config.
# NOTE(review): these aliases are not referenced anywhere else in this file --
# possibly leftovers from an Ollama-based variant; confirm before removing.
summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
title_model_alias = default_config.get('title', 'notes')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Static registry of the two GGUF models this app uses.
# 'template' holds each model family's chat wrapper (ChatML for the
# bulleted-notes fine-tune, [INST] tags for stock Mistral-Instruct).
# NOTE(review): the 'params' entries (num_ctx/num_gpu/num_predict) are never
# read by the GPT4All calls in this file -- presumably kept for parity with
# an Ollama-style config; confirm before relying on them.
models_config = {
    'summary': {
        # Mistral-7B fine-tuned for bulleted-notes summarization.
        'repo_id': 'cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF',
        'filename': 'mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf',
        'local_dir': 'models',
        'template': {
            # ChatML-style prompt wrapping.
            'prefix': '<|im_start|>user\n',
            'suffix': ' <|im_end|>\n<|im_start|>assistant\n',
            'stop_tokens': ['<|im_start|>', '<|im_end|>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,
            'num_predict': 4000
        }
    },
    'title': {
        # Stock Mistral-7B-Instruct, used only for short title generation.
        'repo_id': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
        'filename': 'mistral-7b-instruct-v0.2.Q5_0.gguf',
        'local_dir': 'models',
        'template': {
            # Mistral [INST] prompt wrapping.
            'prefix': '<s>[INST] ',
            'suffix': ' [/INST]',
            'stop_tokens': ['</s>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,
            'num_predict': 100
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("Downloading and initializing models...")

# Fetch both GGUF files into ./models up front, because the GPT4All
# instances below are constructed with allow_download=False.
for model_type in ['summary', 'title']:
    cfg = models_config[model_type]
    print(f"Downloading {model_type} model...")
    hf_hub_download(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        local_dir=cfg['local_dir'],
        # NOTE(review): local_dir_use_symlinks is deprecated/ignored in recent
        # huggingface_hub releases -- confirm the pinned version accepts it.
        local_dir_use_symlinks=False
    )
|
|
|
|
|
|
|
|
# CPU-only model construction; the files were downloaded above, so
# GPT4All's own downloader is disabled.
print("Initializing summary model...")
summary_model = GPT4All(
    model_name=models_config['summary']['filename'],
    model_path=models_config['summary']['local_dir'],
    allow_download=False,
    device="cpu"
)

print("Initializing title model...")
title_model = GPT4All(
    model_name=models_config['title']['filename'],
    model_path=models_config['title']['local_dir'],
    allow_download=False,
    device="cpu"
)

# Override each model's built-in prompt template with the family-specific
# wrappers from models_config, and clear any default system prompt.
# NOTE(review): mutating GPT4All().config relies on gpt4all internals; the
# generate_* functions below also wrap prompts manually, so confirm these
# overrides are actually consulted at generation time.
summary_model.config["promptTemplate"] = models_config['summary']['template']['prefix'] + "{0}" + models_config['summary']['template']['suffix']
summary_model.config["systemPrompt"] = ""

title_model.config["promptTemplate"] = models_config['title']['template']['prefix'] + "{0}" + models_config['title']['template']['suffix']
title_model.config["systemPrompt"] = ""

print("Models initialized successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sanitize_text(text: str) -> str:
    """Return *text* with surrounding whitespace removed."""
    cleaned = text.strip()
    return cleaned
|
|
|
|
|
def bold_text_before_colon(text: str) -> str:
    """Wrap the lead-in of '- term: ...' bullet lines in ** markers.

    Only bullets whose lead text starts with a letter are matched, so lines
    that are already bolded ('- **term:**') are left untouched.
    """
    bullet_lead = re.compile(r'^([ \t]*-[ \t]*)([a-zA-Z].*?):', re.MULTILINE)
    return bullet_lead.sub(r'\1**\2:**', text)
|
|
|
|
|
def generate_title(text: str, temperature: float = 0.3) -> str:
    """Produce a short title for *text* using the title model.

    Only the first 500 characters of *text* are shown to the model; the
    result is cleaned of any echoed [INST] wrapper, whitespace-collapsed,
    and capped at 150 characters.
    """
    tpl = models_config['title']['template']
    request = f"```{text[:500]}```\n\n{title_prompt}"
    full_prompt = tpl['prefix'] + request + tpl['suffix']

    raw = "".join(
        title_model.generate(
            prompt=full_prompt,
            temp=temperature,
            top_k=40,
            top_p=0.95,
            max_tokens=100,
            streaming=True
        )
    ).strip()

    # Drop any echoed instruction preamble, then normalize whitespace.
    raw = re.sub(r'^.*?\[/INST\]\s*', '', raw)
    raw = re.sub(r'\s+', ' ', raw)
    return raw[:150]
|
|
|
|
|
def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
    """Generate a bulleted-notes summary of *text* with the summary model.

    The raw completion is stripped of any echoed "...assistant" preamble and
    bullet lead-ins are bolded via bold_text_before_colon().
    """
    tpl = models_config['summary']['template']
    full_prompt = tpl['prefix'] + f"```{text}```\n\n{bnotes_prompt}" + tpl['suffix']

    pieces = []
    stream = summary_model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=max_tokens,
        streaming=True
    )
    for piece in stream:
        pieces.append(piece)

    result = "".join(pieces).strip()
    result = re.sub(r'^.*?assistant\s*', '', result)
    return bold_text_before_colon(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_csv(
    file_obj,
    use_existing_titles: bool = True,
    generate_missing_titles: bool = True,
    temperature: float = 0.5,
    title_temperature: float = 0.3
):
    """Process a CSV of texts, yielding (DataFrame, status message) pairs.

    Generator contract: after each processed row this yields the cumulative
    results DataFrame plus a progress string, and finally a (df, completion
    message) pair. Error conditions are also *yielded* (with df=None) so
    callers iterating the generator actually see them -- a plain
    ``return value`` inside a generator only sets StopIteration.value, which
    a ``for`` loop silently discards.

    Parameters:
        file_obj: uploaded file object exposing a ``.name`` path attribute.
        use_existing_titles: reuse a non-empty 'title' cell when present.
        generate_missing_titles: generate a title when none is available.
        temperature: sampling temperature for the summary model.
        title_temperature: sampling temperature for the title model.
    """
    try:
        df = pd.read_csv(file_obj.name)
    except Exception as e:
        # BUG FIX: was `return None, msg` -- invisible to an iterating caller.
        yield None, f"Error reading CSV: {str(e)}"
        return

    if 'text' not in df.columns:
        yield None, "CSV must contain 'text' column"
        return

    output_rows = []

    for idx, row in df.iterrows():
        text = str(row.get('text', ''))

        # BUG FIX: missing cells arrive as NaN and str(NaN) == 'nan' is
        # truthy, so it was silently used as a real title. Only accept
        # genuinely present values.
        raw_title = row['title'] if 'title' in df.columns else None
        original_title = str(raw_title) if (use_existing_titles and pd.notna(raw_title)) else ''

        # Skip rows with no usable text.
        if not text.strip():
            continue

        start_time = time.time()

        if original_title and use_existing_titles:
            title = original_title
            title_generated = False
        elif generate_missing_titles:
            title = generate_title(text, temperature=title_temperature)
            title_generated = True
        else:
            # Deterministic placeholder when title generation is disabled.
            title = f"Text_{idx+1}"
            title_generated = False

        summary = generate_summary(text, temperature=temperature)
        end_time = time.time()

        # Per-row wall-clock time covers title + summary generation.
        elapsed_time = end_time - start_time

        output_row = {
            'title': title,
            'text': text,
            'text.len': len(text),
            'output': summary,
            'output.len': len(summary),
            'time': elapsed_time
        }

        # Record provenance columns only when an existing title was reused.
        if original_title and use_existing_titles:
            output_row['original_title'] = original_title
            output_row['title_generated'] = title_generated

        output_rows.append(output_row)

        yield pd.DataFrame(output_rows), f"Processed {idx+1}/{len(df)} rows..."

    output_df = pd.DataFrame(output_rows)

    # BUG FIX: was `return output_df, msg`; yield so callers receive it.
    yield output_df, f"Processing complete! Processed {len(output_df)} rows."
|
|
|
|
|
def format_for_display(df):
    """Return a preview-friendly copy of *df*.

    Long 'text'/'output' cells are truncated to 200 characters, 'time' is
    rendered as a seconds string, and summary columns are moved to the front.
    An empty DataFrame is returned for None/empty input.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    view = df.copy()

    def _clip(value):
        # Keep the preview compact; full values stay in the source frame.
        return value[:200] + '...' if len(str(value)) > 200 else value

    for col in ('text', 'output'):
        if col in view.columns:
            view[col] = view[col].apply(_clip)

    if 'time' in view.columns:
        view['time'] = view['time'].apply(lambda secs: f"{secs:.2f}s")

    # Lead with the compact summary columns, then the rest in original order.
    leading = [c for c in ('title', 'text.len', 'output.len', 'time') if c in view.columns]
    ordering = leading + [c for c in view.columns if c not in leading]

    return view[ordering]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Static UI copy shown at the top of the Gradio page.
title = "Mistral-7B Text Summarizer with Title Generation"
description = """
Process CSV files with text content and generate:
1. Titles (using Mistral-7B-Instruct-v0.2)
2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)

CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
"""
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: settings column on the left, streaming results on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(title=title, css="""
.output-table { max-height: 500px; overflow-y: auto; }
.progress-text { color: #666; font-style: italic; }
""") as demo:

    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):

            gr.Markdown("## Input Settings")

            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                # NOTE(review): type="file" was removed in Gradio 4
                # (use "filepath"/"binary") -- confirm the pinned version.
                type="file"
            )

            use_existing_titles = gr.Checkbox(
                label="Use existing titles from CSV",
                value=True,
                info="If unchecked, will generate titles for all rows"
            )

            generate_missing_titles = gr.Checkbox(
                label="Generate titles for missing rows",
                value=True,
                info="Generate titles only when 'title' column is empty"
            )

            temperature = gr.Slider(
                label="Summary Temperature",
                value=0.5,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Higher values = more creative, lower = more deterministic"
            )

            title_temperature = gr.Slider(
                label="Title Temperature",
                value=0.3,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Temperature for title generation"
            )

            process_btn = gr.Button("Process CSV", variant="primary")

        with gr.Column(scale=2):

            gr.Markdown("## Results")

            progress_text = gr.Textbox(
                label="Progress",
                value="Ready to process...",
                interactive=False
            )

            # Truncated preview of the results table.
            # NOTE(review): col_count is fixed at 5, but format_for_display
            # can emit 6+ columns (original_title/title_generated) -- verify
            # the extra columns still render.
            display_df = gr.Dataframe(
                label="Preview",
                headers=[],
                datatype=["str", "str", "number", "number", "str"],
                row_count=5,
                col_count=(5, "fixed"),
                wrap=True,
                elem_classes=["output-table"]
            )

            download_csv = gr.File(label="Download Full Results")
|
|
|
|
|
|
|
|
def update_preview(df, message): |
|
|
"""Update the preview display.""" |
|
|
display_df = format_for_display(df) |
|
|
return display_df, message |
|
|
|
|
|
def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp): |
|
|
"""Process CSV and yield incremental updates.""" |
|
|
if file_obj is None: |
|
|
yield None, "Please upload a CSV file", None |
|
|
|
|
|
results_df = None |
|
|
for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp): |
|
|
if df_chunk is not None: |
|
|
results_df = df_chunk |
|
|
yield format_for_display(df_chunk), progress_msg, None |
|
|
|
|
|
if results_df is not None: |
|
|
|
|
|
output_path = "processed_output.csv" |
|
|
results_df.to_csv(output_path, index=False) |
|
|
yield format_for_display(results_df), "Processing complete!", output_path |
|
|
|
|
|
|
|
|
    # Stream incremental results from process_and_update into the UI.
    process_btn.click(
        fn=process_and_update,
        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
        outputs=[display_df, progress_text, download_csv]
    )
|
|
|
|
|
|
|
|
def on_file_upload(file_obj): |
|
|
if file_obj is None: |
|
|
return pd.DataFrame(), "No file uploaded" |
|
|
|
|
|
try: |
|
|
df = pd.read_csv(file_obj.name) |
|
|
preview_df = format_for_display(df.head(5)) |
|
|
info = f"Loaded {len(df)} rows. Columns: {', '.join(df.columns.tolist())}" |
|
|
return preview_df, info |
|
|
except Exception as e: |
|
|
return pd.DataFrame(), f"Error loading file: {str(e)}" |
|
|
|
|
|
    # Refresh the preview and status text whenever a new file is selected.
    file_input.change(
        fn=on_file_upload,
        inputs=[file_input],
        outputs=[display_df, progress_text]
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from containers/LAN;
    # no public share link; debug=True for verbose errors in the console.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )