ghost7 committed on
Commit
57b6e56
·
verified ·
1 Parent(s): 387f63a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
3
+ import torch
4
+ import numpy as np
5
+ from sklearn.metrics import accuracy_score, f1_score
6
+
7
# Step 2: Load dataset
# Each split is shuffled with a fixed seed (42) and then cut down to its
# first 10000 (train) / 2000 (test) rows.
dataset = load_dataset("amazon_polarity")
_shuffled_train = dataset["train"].shuffle(seed=42)
_shuffled_test = dataset["test"].shuffle(seed=42)
train_dataset = _shuffled_train.select(range(10000))
test_dataset = _shuffled_test.select(range(2000))
11
+
12
# Step 3: Tokenize dataset
_TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)
14
def tokenize_function(examples):
    """Tokenize a batch of reviews.

    The text of each example is its "title" and "content" joined by a single
    space. Sequences are truncated and padded to a fixed length of 512.

    Args:
        examples: Batch mapping with parallel "title" and "content" columns.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined texts.
    """
    title_content_pairs = zip(examples["title"], examples["content"])
    joined_texts = list(map(" ".join, title_content_pairs))
    return tokenizer(
        joined_texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
17
# Tokenize both splits, drop the raw text columns, and expose only the
# listed columns as torch tensors.
_RAW_TEXT_COLUMNS = ["title", "content"]
_TENSOR_COLUMNS = ["input_ids", "attention_mask", "label"]

tokenized_train = train_dataset.map(tokenize_function, batched=True).remove_columns(_RAW_TEXT_COLUMNS)
tokenized_test = test_dataset.map(tokenize_function, batched=True).remove_columns(_RAW_TEXT_COLUMNS)
for _split in (tokenized_train, tokenized_test):
    _split.set_format("torch", columns=_TENSOR_COLUMNS)
23
+
24
# Step 4: Fine-tune model
_MODEL_CHECKPOINT = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(_MODEL_CHECKPOINT, num_labels=2)
26
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            arg-max over the last axis of ``predictions``.

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    reference = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(reference, predicted),
        "f1": f1_score(reference, predicted, average="weighted"),
    }
32
# Training configuration: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
_training_config = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**_training_config)

_trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**_trainer_config)
trainer.train()
53
+
54
# Step 5: Evaluate and predict
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Model and tokenizer are written to the same directory.
_EXPORT_DIR = "./fine_tuned_distilbert"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(_EXPORT_DIR)
59
def predict_sentiment(text):
    """Classify a review as "Positive" or "Negative" with the fine-tuned model.

    Uses the module-level ``tokenizer`` and ``model``. The model is moved to
    CUDA when available (CPU otherwise) and switched to evaluation mode
    before the forward pass.

    Args:
        text: Review text to classify; the input is truncated to 512 tokens.

    Returns:
        "Positive" when the highest-scoring class index is 1, otherwise
        "Negative".
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Do not rely on the caller having left the model in eval mode: with
    # dropout active the same text could be classified differently between
    # calls.
    model.eval()

    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**encoded).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return "Positive" if predicted_class == 1 else "Negative"
69
# Smoke-test the classifier on a few hand-written reviews.
example_reviews = [
    "Great product! Fast shipping and works perfectly as described.",
    "Terrible quality, broke after one use. Very disappointed.",
    "The item is okay, not amazing but does the job for the price.",
]
# map() is lazy, so each review is classified right before it is printed.
for review, sentiment in zip(example_reviews, map(predict_sentiment, example_reviews)):
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")
77
+
78
# Create Gradio interface
# The import block at the top of the file does not bring in Gradio, yet `gr`
# is used below; without this import the script stops with a NameError once
# training and the example predictions have finished.
import gradio as gr

# Text in, text out: the input string is passed to predict_sentiment and the
# returned label is shown to the user.
interface = gr.Interface(
    fn=predict_sentiment,
    inputs="text",
    outputs="text",
    title="Amazon Sentiment Analysis Demo",
)
interface.launch()