Spaces:

AishaniS
/

WhatsAppChatSummarizerModel

Sleeping

App Files Files Community

AishaniS committed on Dec 29, 2025

Commit

424a606

verified ·

1 Parent(s): 52350b0

Upload 3 files

Browse files

Files changed (3) hide show

module_2_preprocessing.py +29 -0
module_3_model.py +16 -0
module_4_evaluation.py +46 -0

module_2_preprocessing.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Save as: module_2_preprocessing.py
+from transformers import AutoTokenizer
+import pandas as pd
+# 1. Load the tokenizer from the local "./tokenizer" directory
+print("Loading local tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained("./tokenizer")  # relative path: resolved against the current working directory
+# 2. Simulate raw WhatsApp data: two messages in "<date>, <time> - <sender>: <text>" form
+raw_chat = """
+12/05/2025, 10:00 PM - John: Hey, are we meeting tomorrow?
+12/05/2025, 10:01 PM - Sarah: Yes, at the cafe.
+"""  # the literal begins and ends with a newline
+# 3. Preprocess (Clean & Tokenize)
+def clean_text(text):
+    """Strip the WhatsApp export prefix ("<date>, <time> - ") from every line.
+
+    The previous version removed only two hard-coded timestamps, so any other
+    message line passed through uncleaned. This version matches the prefix
+    generically: a d/m/y style date, a comma, a time with optional seconds and
+    optional AM/PM marker, then " - ". Only prefixes at the start of a line
+    are removed; the sender name and the message text are kept.
+
+    Args:
+        text: Raw chat export, one message per line.
+
+    Returns:
+        The text with each leading timestamp prefix removed. Lines without a
+        matching prefix are returned unchanged.
+    """
+    import re  # local import so this block does not depend on the file's import header
+
+    # (?m) makes ^ match at the start of each line, not just the whole string.
+    prefix = r"(?m)^\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?::\d{2})?(?:\s?[AaPp][Mm])? - "
+    return re.sub(prefix, "", text)
+cleaned_text = clean_text(raw_chat)  # drop the timestamp prefixes, keep "<sender>: <text>"
+print(f"\nCleaned Text:\n{cleaned_text}")
+# 4. Tokenization (the core requirement): encode the cleaned chat with the local tokenizer
+tokens = tokenizer(cleaned_text, truncation=True, padding="max_length", max_length=50)  # truncate or pad to exactly 50 token ids
+print("\n--- Tokenization Output (First 20 tokens) ---")
+print(f"Input IDs: {tokens['input_ids'][:20]}")  # only the first 20 ids are shown
+print(f"Attention Mask: {tokens['attention_mask'][:20]}")  # same 20-position slice of the mask
+print("\n[Success] Preprocessing module demonstrated with local tokenizer.")

module_3_model.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Save as: module_3_model.py
+from transformers import AutoModelForSeq2SeqLM
+# 1. Load the seq2seq model from the local "./pegasus_model" directory
+print("Loading local model architecture...")
+model = AutoModelForSeq2SeqLM.from_pretrained("./pegasus_model")  # relative path: resolved against the current working directory
+# 2. Display architecture details read from model.config (Requirement for Module 3)
+print(f"\nModel Type: {model.config.model_type}")
+print(f"Vocab Size: {model.config.vocab_size}")
+print(f"Max Position Embeddings: {model.config.max_position_embeddings}")
+print(f"Encoder Layers: {model.config.encoder_layers}")  # NOTE(review): assumes the config exposes encoder_layers/decoder_layers; confirm if the checkpoint changes
+print(f"Decoder Layers: {model.config.decoder_layers}")
+print("\n--- Full Architecture (Snippet) ---")
+print(model)  # prints the model's full repr, not a shortened snippet

module_4_evaluation.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Save as: module_4_evaluation.py
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from datasets import load_dataset
+import evaluate
+# 1. Setup: pick the device and load the ROUGE metric
+device = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no CUDA device is available
+rouge = evaluate.load("rouge")
+print(f"Running evaluation on: {device}")
+# 2. Load the local tokenizer and model, moving the model to the chosen device
+tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
+model = AutoModelForSeq2SeqLM.from_pretrained("./pegasus_model").to(device)
+# 3. Load evaluation data (a slice of the SAMSum *test* split, not the validation split)
+dataset = load_dataset("knkarthick/samsum", split="test[:10]") # first 10 test rows only, for speed
+print("Dataset loaded.")
+def generate_summary(batch):
+    """Summarize a batch of dialogues; used as the callback for ``dataset.map``.
+
+    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
+
+    Args:
+        batch: Mapping whose "dialogue" entry holds the texts to summarize
+            (a list of strings when invoked through the batched map below).
+
+    Returns:
+        The same ``batch`` object with an added "pred_summary" entry holding
+        one decoded summary per input dialogue.
+    """
+    inputs = tokenizer(
+        batch["dialogue"],
+        return_tensors="pt",
+        max_length=1024,
+        truncation=True,
+        padding=True,
+    ).to(device)
+    # padding=True pads the shorter dialogues in the batch, so pass the mask
+    # explicitly rather than relying on generate() inferring it from pad ids.
+    summary_ids = model.generate(
+        inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        max_length=128,
+        num_beams=4,
+        length_penalty=0.8,
+    )
+    # Decode the generated ids back to text, dropping special tokens.
+    batch["pred_summary"] = tokenizer.batch_decode(
+        summary_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True,
+    )
+    return batch
+# 4. Run inference: add a "pred_summary" column by mapping generate_summary over the dataset
+print("Generating summaries for evaluation...")
+results = dataset.map(generate_summary, batched=True, batch_size=2)  # generate_summary receives 2 rows per call
+# 5. Calculate metrics: compare generated summaries with the dataset's reference "summary" column
+print("Computing ROUGE scores...")
+scores = rouge.compute(predictions=results["pred_summary"], references=results["summary"])
+print("\n--- Evaluation Results (ROUGE) ---")
+print(f"ROUGE-1: {scores['rouge1']:.4f}")  # scores are printed to 4 decimal places
+print(f"ROUGE-2: {scores['rouge2']:.4f}")
+print(f"ROUGE-L: {scores['rougeL']:.4f}")