AishaniS commited on
Commit
424a606
·
verified ·
1 Parent(s): 52350b0

Upload 3 files

Browse files
module_2_preprocessing.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Save as: module_2_preprocessing.py
from transformers import AutoTokenizer
import pandas as pd  # NOTE(review): imported but never used in this module — confirm before removing

# 1. Load YOUR LOCAL Tokenizer
# Expects a tokenizer previously saved with save_pretrained() into ./tokenizer.
print("Loading local tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

# 2. Simulate Raw WhatsApp Data
# Each message line follows the WhatsApp export shape:
# "DD/MM/YYYY, H:MM AM/PM - Sender: message"
raw_chat = """
12/05/2025, 10:00 PM - John: Hey, are we meeting tomorrow?
12/05/2025, 10:01 PM - Sarah: Yes, at the cafe.
"""
# 3. Preprocess (Clean & Tokenize)
import re  # used by the generalized timestamp pattern below

# WhatsApp export prefix: "DD/MM/YYYY, H:MM AM/PM - " (day/month/year order
# per the sample data above). Compiled once at module level.
_TIMESTAMP_RE = re.compile(r"\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2} [AP]M - ")


def clean_text(text):
    """Strip WhatsApp "date, time - " prefixes from every message in *text*.

    Generalizes the original implementation, which only removed the two
    timestamps hard-coded from the demo chat and silently left any other
    message line uncleaned. Any prefix matching the standard export format
    is now removed; text without such prefixes is returned unchanged.
    """
    return _TIMESTAMP_RE.sub("", text)
cleaned = clean_text(raw_chat)
print(f"\nCleaned Text:\n{cleaned}")

# 4. Tokenization (The Core Requirement)
# Pad/truncate every sample to exactly 50 tokens.
encoding = tokenizer(
    cleaned,
    truncation=True,
    padding="max_length",
    max_length=50,
)

print("\n--- Tokenization Output (First 20 tokens) ---")
ids_head = encoding["input_ids"][:20]
mask_head = encoding["attention_mask"][:20]
print(f"Input IDs: {ids_head}")
print(f"Attention Mask: {mask_head}")
print("\n[Success] Preprocessing module demonstrated with local tokenizer.")
module_3_model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Save as: module_3_model.py
from transformers import AutoModelForSeq2SeqLM

# 1. Load YOUR LOCAL Model
# Expects model weights previously saved with save_pretrained() into ./pegasus_model.
print("Loading local model architecture...")
model = AutoModelForSeq2SeqLM.from_pretrained("./pegasus_model")

# 2. Display Architecture Details (Requirement for Module 3)
# Table-driven printing: one (label, value) pair per config field.
cfg = model.config
architecture_fields = [
    ("\nModel Type", cfg.model_type),
    ("Vocab Size", cfg.vocab_size),
    ("Max Position Embeddings", cfg.max_position_embeddings),
    ("Encoder Layers", cfg.encoder_layers),
    ("Decoder Layers", cfg.decoder_layers),
]
for label, value in architecture_fields:
    print(f"{label}: {value}")

print("\n--- Full Architecture (Snippet) ---")
print(model)
module_4_evaluation.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Save as: module_4_evaluation.py
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate

# 1. Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
rouge = evaluate.load("rouge")
print(f"Running evaluation on: {device}")

# 2. Load LOCAL Artifacts
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
model = AutoModelForSeq2SeqLM.from_pretrained("./pegasus_model").to(device)
model.eval()  # explicit inference mode: disables dropout so generation is deterministic

# 3. Load Test Data (Real validation data)
dataset = load_dataset("knkarthick/samsum", split="test[:10]")  # Testing on 10 samples for speed
print("Dataset loaded.")


def generate_summary(batch):
    """Generate an abstractive summary for each dialogue in *batch*.

    Adds a ``pred_summary`` column. Fixes two defects in the original:
    - the attention mask is now passed to ``generate()``; with
      ``padding=True`` the batch contains pad tokens, and omitting the mask
      lets the model attend to them, corrupting padded-batch outputs;
    - generation runs under ``torch.no_grad()`` so no autograd graph is
      built during pure inference.
    """
    inputs = tokenizer(
        batch["dialogue"],
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        padding=True,
    ).to(device)

    # Generate: beam search; length_penalty < 1 mildly favours shorter summaries
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],  # mask out pad positions
            max_length=128,
            num_beams=4,
            length_penalty=0.8,
        )

    # Decode
    batch["pred_summary"] = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return batch


# 4. Run Inference
print("Generating summaries for evaluation...")
results = dataset.map(generate_summary, batched=True, batch_size=2)

# 5. Calculate Metrics
print("Computing ROUGE scores...")
scores = rouge.compute(predictions=results["pred_summary"], references=results["summary"])

print("\n--- Evaluation Results (ROUGE) ---")
print(f"ROUGE-1: {scores['rouge1']:.4f}")
print(f"ROUGE-2: {scores['rouge2']:.4f}")
print(f"ROUGE-L: {scores['rougeL']:.4f}")