cognitivetech committed on
Commit
d09c502
·
verified ·
1 Parent(s): f613100

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +424 -0
app.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # Imports
3
+ # ============================================
4
+ import gradio as gr
5
+ import pandas as pd
6
+ import time
7
+ from pathlib import Path
8
+ import yaml
9
+ from typing import List, Tuple, Optional
10
+ import re
11
+ from gpt4all import GPT4All
12
+ from huggingface_hub import hf_hub_download
13
+
14
+ # ============================================
15
+ # Configuration Loading
16
+ # ============================================
17
# Read the YAML configuration. yaml.safe_load() returns None for an empty
# document, so fall back to an empty mapping; every lookup below then uses
# its built-in default instead of failing on `None.get`.
with open('_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f) or {}

# Load defaults (`or {}` also covers sections that are present but null)
default_config = config.get('defaults') or {}
prompts_config = config.get('prompts') or {}
title_config = config.get('title_generation') or {}

# Get prompts
bnotes_prompt = (prompts_config.get('bnotes') or {}).get(
    'prompt',
    'Write comprehensive bulleted notes summarizing the provided text, with headings and terms in bold.',
)
title_prompt = title_config.get(
    'prompt',
    'The content between backticks is part of a book-chapter. write 8-11 words describing it.',
)

# Model selection
summary_model_alias = default_config.get('summary', 'cognitivetech/obook_summary:q6_k')
title_model_alias = default_config.get('title', 'notes')
32
+
33
+ # ============================================
34
+ # Model Definitions
35
+ # ============================================
36
# Per-role model settings: where to fetch the GGUF file, the prompt wrapper
# (prefix/suffix) applied around the user request, and generation parameters.
models_config = {
    'summary': {
        'repo_id': 'cognitivetech/Mistral-7b-Inst-0.2-Bulleted-Notes_GGUF',
        'filename': 'mistral-7b-inst-0.2-bulleted-notes.Q5_K_M.gguf',
        'local_dir': 'models',
        'template': {
            'prefix': '<|im_start|>user\n',
            'suffix': ' <|im_end|>\n<|im_start|>assistant\n',
            'stop_tokens': ['<|im_start|>', '<|im_end|>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,  # CPU only
            'num_predict': 4000
        }
    },
    'title': {
        'repo_id': 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
        'filename': 'mistral-7b-instruct-v0.2.Q5_0.gguf',
        'local_dir': 'models',
        'template': {
            'prefix': '<s>[INST] ',
            'suffix': ' [/INST]',
            'stop_tokens': ['</s>']
        },
        'params': {
            'num_ctx': 8000,
            'num_gpu': -1,
            'num_predict': 100  # Shorter for titles
        }
    }
}

# ============================================
# Model Initialization
# ============================================
print("Downloading and initializing models...")

# Download models
for model_type in ['summary', 'title']:
    cfg = models_config[model_type]
    print(f"Downloading {model_type} model...")
    hf_hub_download(
        repo_id=cfg['repo_id'],
        filename=cfg['filename'],
        local_dir=cfg['local_dir'],
        local_dir_use_symlinks=False
    )

# Initialize models
# The configured context size is passed through as `n_ctx`; previously
# `num_ctx` was declared above but never handed to GPT4All, so the models
# were loaded with the library's default context window.
print("Initializing summary model...")
summary_model = GPT4All(
    model_name=models_config['summary']['filename'],
    model_path=models_config['summary']['local_dir'],
    allow_download=False,
    device="cpu",
    n_ctx=models_config['summary']['params']['num_ctx']
)

print("Initializing title model...")
title_model = GPT4All(
    model_name=models_config['title']['filename'],
    model_path=models_config['title']['local_dir'],
    allow_download=False,
    device="cpu",
    n_ctx=models_config['title']['params']['num_ctx']
)

# Configure models
# Summary model uses custom template from modelfile
summary_template = models_config['summary']['template']
summary_model.config["promptTemplate"] = summary_template['prefix'] + "{0}" + summary_template['suffix']
summary_model.config["systemPrompt"] = ""

# Title model uses Mistral instruct format
title_template = models_config['title']['template']
title_model.config["promptTemplate"] = title_template['prefix'] + "{0}" + title_template['suffix']
title_model.config["systemPrompt"] = ""

print("Models initialized successfully!")
112
+
113
+ # ============================================
114
+ # Text Processing Functions
115
+ # ============================================
116
def sanitize_text(text: str) -> str:
    """Return ``text`` with leading and trailing whitespace removed."""
    cleaned = text.strip()
    return cleaned
119
+
120
def bold_text_before_colon(text: str) -> str:
    """Wrap the label of each ``-`` bullet line in Markdown bold.

    On every line that starts with optional spaces/tabs, a ``-`` bullet and
    then an ASCII letter, the text up to and including the first colon is
    wrapped in ``**``. Lines whose label already starts with ``*`` (i.e. is
    already bold) do not match and are left unchanged, as are non-bullet lines.
    """
    bullet_label = re.compile(r'^([ \t]*-[ \t]*)([a-zA-Z].*?):', flags=re.MULTILINE)
    return bullet_label.sub(r'\1**\2:**', text)
125
+
126
def generate_title(text: str, temperature: float = 0.3) -> str:
    """Ask the title model for a short description of ``text``.

    Only the first 500 characters of ``text`` are sent, fenced in backticks and
    followed by the configured title prompt, wrapped in the title model's
    prefix/suffix template. The streamed tokens are concatenated, anything up
    to a leading ``[/INST]`` tag on the first line is removed, whitespace runs
    are collapsed to single spaces, and the result is cut to 150 characters.
    """
    request = f"```{text[:500]}```\n\n{title_prompt}"

    # Use title model with Mistral instruct format
    template = models_config['title']['template']
    full_prompt = template['prefix'] + request + template['suffix']

    token_stream = title_model.generate(
        prompt=full_prompt,
        temp=temperature,
        top_k=40,
        top_p=0.95,
        max_tokens=100,
        streaming=True
    )
    raw_title = "".join(token_stream).strip()

    # Clean up the title (remove any remaining tags or unwanted characters)
    untagged = re.sub(r'^.*?\[/INST\]\s*', '', raw_title)  # Remove [/INST] and anything before it
    single_spaced = re.sub(r'\s+', ' ', untagged)  # Normalize whitespace
    return single_spaced[:150]  # Limit to 150 chars
149
+
150
def generate_summary(text: str, temperature: float = 0.5, max_tokens: int = 4000) -> str:
    """Generate bulleted notes summary.

    ``text`` is fenced in backticks, followed by the configured notes prompt,
    and wrapped in the summary model's prefix/suffix template. The streamed
    tokens are concatenated and stripped; a leaked role marker at the very
    start of the output is removed, and bullet labels are bolded with
    ``bold_text_before_colon``.

    Args:
        text: Source text to summarize.
        temperature: Sampling temperature passed to the model.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The cleaned-up summary text.
    """
    prompt = f"```{text}```\n\n{bnotes_prompt}"

    # Use custom template from modelfile
    template = models_config['summary']['template']
    full_prompt = template['prefix'] + prompt + template['suffix']

    summary = "".join(
        summary_model.generate(
            prompt=full_prompt,
            temp=temperature,
            top_k=40,
            top_p=0.95,
            max_tokens=max_tokens,
            streaming=True
        )
    ).strip()

    # Clean up the response: drop a leading "assistant" role marker only when
    # it stands alone at the very start (optionally after <|im_start|>, and
    # followed by a colon, a newline or the end of the output). The previous
    # pattern removed everything up to the first "assistant" anywhere on the
    # first line, which ate real content such as "- Virtual assistants: ...".
    summary = re.sub(
        r'\A(?:<\|im_start\|>\s*)?assistant[ \t]*(?::|\n|\Z)\s*',
        '',
        summary,
    )
    summary = bold_text_before_colon(summary)
    return summary
173
+
174
+ # ============================================
175
+ # Processing Functions
176
+ # ============================================
177
def process_csv(
    file_obj,
    use_existing_titles: bool = True,
    generate_missing_titles: bool = True,
    temperature: float = 0.5,
    title_temperature: float = 0.3
):
    """Process CSV file with title and text columns.

    This is a generator: every outcome, including errors and the final
    result, is delivered with ``yield`` so that callers iterating over it see
    it. (A ``return value`` inside a generator is invisible to a ``for`` loop.)

    Args:
        file_obj: Uploaded file; either an object exposing the path as
            ``.name`` or a plain path string.
        use_existing_titles: Use the CSV's ``title`` value when a row has one.
        generate_missing_titles: Generate a title with the title model for
            rows without a usable title; otherwise fall back to ``Text_<n>``.
        temperature: Sampling temperature for summaries.
        title_temperature: Sampling temperature for generated titles.

    Yields:
        ``(DataFrame | None, message)`` tuples: ``(None, error)`` once on
        failure; otherwise the rows processed so far after each row, and
        finally the complete DataFrame with a completion message.
    """
    if file_obj is None:
        yield None, "Please upload a CSV file"
        return

    # Accept both an upload object (path in `.name`) and a plain path.
    csv_path = getattr(file_obj, 'name', file_obj)

    # Read CSV
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        yield None, f"Error reading CSV: {str(e)}"
        return

    # Check required columns
    if 'text' not in df.columns:
        yield None, "CSV must contain 'text' column"
        return

    read_titles = use_existing_titles and 'title' in df.columns
    total_rows = len(df)
    output_rows = []

    # Process each row
    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        # Empty CSV cells are NaN; str(NaN) would be the truthy text 'nan'.
        raw_text = row['text']
        text = '' if pd.isna(raw_text) else str(raw_text)

        # Skip empty text
        if not text.strip():
            continue

        original_title = ''
        if read_titles:
            raw_title = row['title']
            if not pd.isna(raw_title) and str(raw_title).strip():
                original_title = str(raw_title)

        # Generate or use title (timing covers title + summary generation)
        start_time = time.time()

        if original_title:
            title = original_title
            title_generated = False
        elif generate_missing_titles:
            title = generate_title(text, temperature=title_temperature)
            title_generated = True
        else:
            title = f"Text_{row_number}"
            title_generated = False

        # Generate summary
        summary = generate_summary(text, temperature=temperature)
        elapsed_time = time.time() - start_time

        # Prepare output row
        output_row = {
            'title': title,
            'text': text,
            'text.len': len(text),
            'output': summary,
            'output.len': len(summary),
            'time': elapsed_time
        }

        # Add original title if it exists
        if original_title:
            output_row['original_title'] = original_title
        output_row['title_generated'] = title_generated

        output_rows.append(output_row)

        # Yield intermediate progress
        yield pd.DataFrame(output_rows), f"Processed {row_number}/{total_rows} rows..."

    # Final result, yielded so that iterating callers receive it too
    output_df = pd.DataFrame(output_rows)
    yield output_df, f"Processing complete! Processed {len(output_df)} rows."
252
+
253
def format_for_display(df):
    """Format DataFrame for nice display.

    Works on a copy, so ``df`` itself is not modified.

    - ``text`` / ``output`` cells longer than 200 characters (measured on
      their string form) are cut to 200 characters plus ``'...'``; shorter
      cells are returned unchanged.
    - ``time`` cells are rendered as ``"<seconds>s"`` with two decimals;
      values that cannot be formatted as a number are left as they are, so
      an uploaded CSV with its own ``time`` column still previews.
    - Columns are ordered ``title, text.len, output.len, time`` (those that
      exist), followed by all remaining columns in their original order.

    Returns:
        The formatted copy, or an empty DataFrame when ``df`` is ``None`` or
        has no rows.
    """
    if df is None or len(df) == 0:
        return pd.DataFrame()

    def _truncate(value, limit=200):
        # Slice the string form: the raw value may not be a str.
        as_text = str(value)
        if len(as_text) > limit:
            return as_text[:limit] + '...'
        return value

    def _format_seconds(value):
        try:
            return f"{value:.2f}s"
        except (TypeError, ValueError):
            # Non-numeric cell (e.g. "12:30" from a user CSV): show as-is.
            return value

    display_df = df.copy()

    # Truncate long columns for display
    for column in ('text', 'output'):
        if column in display_df.columns:
            display_df[column] = display_df[column].apply(_truncate)

    # Format time column
    if 'time' in display_df.columns:
        display_df['time'] = display_df['time'].apply(_format_seconds)

    # Reorder columns for display: preferred columns first, then the rest
    preferred = ['title', 'text.len', 'output.len', 'time']
    display_order = [col for col in preferred if col in display_df.columns]
    display_order.extend(col for col in display_df.columns if col not in preferred)

    return display_df[display_order]
280
+
281
+ # ============================================
282
+ # Gradio Interface
283
+ # ============================================
284
title = "Mistral-7B Text Summarizer with Title Generation"
description = """
Process CSV files with text content and generate:
1. Titles (using Mistral-7B-Instruct-v0.2)
2. Bulleted notes summaries (using Mistral-7b-Inst-0.2-Bulleted-Notes)

CSV must contain at least a 'text' column. Optionally include 'title' column to use existing titles.
"""

# Build the UI: settings column on the left, progress/preview/download on the
# right, then wire the button and the file upload to their handlers.
with gr.Blocks(title=title, css="""
.output-table { max-height: 500px; overflow-y: auto; }
.progress-text { color: #666; font-style: italic; }
""") as demo:

    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            # Input Section
            gr.Markdown("## Input Settings")

            # No explicit `type`: the installed Gradio's default is used, and
            # the handlers below accept either a path string or an object
            # exposing the path as `.name`.
            # NOTE(review): `type="file"` is the 3.x default and is presumably
            # rejected by Gradio 4 - confirm against the pinned version.
            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"]
            )

            use_existing_titles = gr.Checkbox(
                label="Use existing titles from CSV",
                value=True,
                info="If unchecked, will generate titles for all rows"
            )

            generate_missing_titles = gr.Checkbox(
                label="Generate titles for missing rows",
                value=True,
                info="Generate titles only when 'title' column is empty"
            )

            temperature = gr.Slider(
                label="Summary Temperature",
                value=0.5,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Higher values = more creative, lower = more deterministic"
            )

            title_temperature = gr.Slider(
                label="Title Temperature",
                value=0.3,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                info="Temperature for title generation"
            )

            process_btn = gr.Button("Process CSV", variant="primary")

        with gr.Column(scale=2):
            # Output Section
            gr.Markdown("## Results")

            progress_text = gr.Textbox(
                label="Progress",
                value="Ready to process...",
                interactive=False
            )

            # NOTE(review): the preview is declared with 5 fixed columns while
            # the processed frame has more - confirm how the installed Gradio
            # renders the extra columns.
            display_df = gr.Dataframe(
                label="Preview",
                headers=[],
                datatype=["str", "str", "number", "number", "str"],
                row_count=5,
                col_count=(5, "fixed"),
                wrap=True,
                elem_classes=["output-table"]
            )

            download_csv = gr.File(label="Download Full Results")

    # Event handlers
    def update_preview(df, message):
        """Update the preview display."""
        formatted = format_for_display(df)
        return formatted, message

    def process_and_update(file_obj, use_titles, gen_missing, temp, title_temp):
        """Process CSV and yield incremental updates.

        Yields ``(preview DataFrame, progress message, download path)``; the
        download path is ``None`` until processing has finished.
        """
        if file_obj is None:
            yield None, "Please upload a CSV file", None
            # Stop here: without this return the handler fell through and
            # tried to process the missing upload.
            return

        results_df = None
        for df_chunk, progress_msg in process_csv(file_obj, use_titles, gen_missing, temp, title_temp):
            if df_chunk is not None:
                results_df = df_chunk
            yield format_for_display(df_chunk), progress_msg, None

        if results_df is not None:
            # Save the full (untruncated) results for download.
            # NOTE(review): fixed file name in the working directory, shared
            # by all sessions.
            output_path = "processed_output.csv"
            results_df.to_csv(output_path, index=False)
            yield format_for_display(results_df), "Processing complete!", output_path

    # Connect events
    process_btn.click(
        fn=process_and_update,
        inputs=[file_input, use_existing_titles, generate_missing_titles, temperature, title_temperature],
        outputs=[display_df, progress_text, download_csv]
    )

    # Update preview when file is uploaded
    def on_file_upload(file_obj):
        """Preview the first five rows of the uploaded CSV and list its columns."""
        if file_obj is None:
            return pd.DataFrame(), "No file uploaded"

        # The upload arrives as a path string or as an object with `.name`.
        csv_path = getattr(file_obj, 'name', file_obj)
        try:
            df = pd.read_csv(csv_path)
            preview_df = format_for_display(df.head(5))
            info = f"Loaded {len(df)} rows. Columns: {', '.join(df.columns.tolist())}"
            return preview_df, info
        except Exception as e:
            return pd.DataFrame(), f"Error loading file: {str(e)}"

    file_input.change(
        fn=on_file_upload,
        inputs=[file_input],
        outputs=[display_df, progress_text]
    )
414
+
415
+ # ============================================
416
+ # Launch Application
417
+ # ============================================
418
if __name__ == "__main__":
    # The click handler is a generator that streams progress updates, so the
    # request queue is enabled explicitly before launching.
    # NOTE(review): Gradio 3.x is understood to require the queue for
    # generator handlers, and calling queue() on 4.x should be harmless -
    # confirm against the pinned version.
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )