Upload folder using huggingface_hub
Browse files- README.md +2 -3
- app.py +48 -0
- bert_imdb_sentiment.pth +3 -0
- data/imdb_data.py +27 -0
- hub.py +7 -0
- model/sentiment_model.py +23 -0
- predict.py +81 -0
- schemas/sentiment.py +13 -0
- services/inference.py +50 -0
- streamlit_app.py +61 -0
- test.py +16 -0
- train.py +105 -0
README.md
CHANGED
|
@@ -1,3 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
---
|
|
|
|
| 1 |
+
# imdb_sentiment_analysis
|
| 2 |
+
基于 BERT 的 IMDB 电影评论情感分析 FastAPI 服务,并已使用 Streamlit 实现了简单的前端。
|
|
|
app.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py — FastAPI service exposing the fine-tuned BERT IMDB sentiment model.
from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
import torch
from transformers import BertTokenizer

from model.sentiment_model import SentimentAnalysisModel
from schemas.sentiment import (
    BatchSentimentRequest,
    SentimentRequest,
    SentimentResponse,
)
from services.inference import batch_predict, predict_sentiment

app = FastAPI()


# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favor of lifespan handlers; kept for compatibility with the version this
# project currently runs — confirm before upgrading FastAPI.
@app.on_event("startup")
def startup_event():
    """Load device, tokenizer and model weights once at process start."""
    global tokenizer, model, device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = SentimentAnalysisModel("bert-base-uncased")
    model.load_state_dict(torch.load("bert_imdb_sentiment.pth", map_location=device))
    model.to(device)
    model.eval()  # inference mode: disables dropout


@app.post("/predict", response_model=SentimentResponse)
async def predict_api(req: SentimentRequest):
    """Classify a single text.

    The CPU/GPU-bound forward pass runs in a worker thread via
    run_in_threadpool so the event loop is never blocked.
    """
    label, conf = await run_in_threadpool(
        predict_sentiment, req.text, tokenizer, model, device
    )
    return SentimentResponse(label=label, confidence=conf)


@app.post("/predict_batch")
async def predict_batch_api(req: BatchSentimentRequest):
    """Classify a list of texts in one batched forward pass."""
    results = await run_in_threadpool(
        batch_predict, req.texts, tokenizer, model, device
    )
    return results


@app.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}
|
bert_imdb_sentiment.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:399289093f3a22c71110cfcf4f558f8611d025711e74e2de6b39de9236ac04b2
|
| 3 |
+
size 438019196
|
data/imdb_data.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class IMDBDataset(Dataset):
    """IMDB reviews pre-tokenized in a single pass for a BERT-style model.

    The whole split is encoded up front in __init__, so __getitem__ only
    has to wrap the cached token ids in tensors.
    """

    def __init__(self, split, tokenizer, max_length=256):
        print(f"Loading IMDB {split} dataset...")
        self.dataset = load_dataset("imdb")[split]
        print(f"IMDB {split} loaded.")
        # Encode every review once; padding=True pads to the longest text
        # in the split (capped at max_length by truncation).
        self.encodings = tokenizer(
            self.dataset["text"],
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        self.labels = self.dataset["label"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        ids = self.encodings["input_ids"][idx]
        mask = self.encodings["attention_mask"][idx]
        sample = {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample
|
hub.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Push the project folder to the Hugging Face Hub."""
from huggingface_hub import login, upload_folder


def main():
    """Authenticate, then upload the current folder as a model repo."""
    # login() with no arguments prompts for / reuses the cached HF token.
    # (The previous comment claimed a token was embedded here — it is not,
    # and no secret should ever be committed to this file.)
    login()
    upload_folder(folder_path=".", repo_id="ikkbor/bert_imdb_sentiment", repo_type="model")


if __name__ == "__main__":
    # Guarded so importing this module never triggers an upload.
    main()
|
model/sentiment_model.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from transformers import BertModel
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class SentimentAnalysisModel(nn.Module):
    """BERT encoder with a dropout + linear head for binary sentiment."""

    def __init__(self, pretrained_model_name="bert-base-uncased"):
        super().__init__()
        # Pretrained encoder; only its pooled [CLS] representation is used.
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return raw 2-class logits of shape (batch, 2)."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.classifier(pooled)
|
predict.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# predict.py — standalone inference script using the trained checkpoint.
import torch
from transformers import BertTokenizer

from model.sentiment_model import SentimentAnalysisModel


# Device: prefer GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Tokenizer matching the pretrained encoder.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 2. Model architecture.
model = SentimentAnalysisModel("bert-base-uncased")

# 3. Fine-tuned weights.
model.load_state_dict(
    torch.load("bert_imdb_sentiment.pth", map_location=device)
)

model.to(device)
model.eval()  # critical: disables dropout for deterministic inference

print("Model loaded successfully.")


def predict_sentiment(text):
    """Classify one review; return (label, confidence)."""
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probs = torch.softmax(outputs, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    label_map = {0: "Negative 😡", 1: "Positive 😊"}
    return label_map[pred], probs[0][pred].item()


def batch_predict(texts):
    """Classify a list of reviews in one forward pass.

    Returns a list of class indices: 0 = Negative, 1 = Positive.
    """
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)

    return preds.cpu().tolist()


if __name__ == "__main__":
    texts = [
        "This movie is terrible.",
        "I really enjoyed this film!",
        "Not bad, but could be better."
    ]

    results = batch_predict(texts)
    print(results)  # e.g. [0, 1, 1]
|
schemas/sentiment.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# schemas/sentiment.py — request/response payloads for the sentiment API.
from pydantic import BaseModel
from typing import List


class SentimentRequest(BaseModel):
    """Single-text classification request."""

    text: str


class SentimentResponse(BaseModel):
    """Predicted label plus softmax confidence for one text."""

    label: str
    confidence: float


class BatchSentimentRequest(BaseModel):
    """Request carrying several texts to classify in one call."""

    texts: List[str]
|
services/inference.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# services/inference.py
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
def predict_sentiment(text, tokenizer, model, device):
    """Classify a single text.

    Returns (label, confidence): label is "Negative" or "Positive",
    confidence is the softmax probability of the predicted class.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        scores = torch.softmax(logits, dim=1)
        winner = torch.argmax(scores, dim=1).item()

    # Index 0 -> "Negative", index 1 -> "Positive".
    return ("Negative", "Positive")[winner], scores[0][winner].item()
|
| 23 |
+
|
| 24 |
+
def batch_predict(texts, tokenizer, model, device):
    """Classify several texts in a single batched forward pass.

    Returns one {"text", "label", "confidence"} dict per input, in order.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(ids, mask)
        probs = torch.softmax(logits, dim=1)
        winners = torch.argmax(probs, dim=1)

    label_map = {0: "Negative", 1: "Positive"}

    results = []
    for row, (text, p) in enumerate(zip(texts, winners)):
        results.append({
            "text": text,
            "label": label_map[p.item()],
            "confidence": probs[row][p].item()
        })
    return results
|
streamlit_app.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# streamlit_app.py — minimal front-end for the FastAPI sentiment service.
# NOTE(review): this file previously contained an unresolved git merge
# conflict (<<<<<<< HEAD ... >>>>>>> 9ef5a78); both sides were identical,
# so the conflict markers and the duplicate copy were removed.
import streamlit as st
import requests

st.title("IMDB 情感分析 Demo")

# Single-text input
text = st.text_area("输入文本", "This movie was amazing!")

if st.button("预测情感"):
    response = requests.post(
        "http://127.0.0.1:8000/predict",
        json={"text": text}
    )
    result = response.json()
    st.write(f"情感:{result['label']}")
    st.write(f"置信度:{result['confidence']:.2f}")

# Batch input, one text per line
batch_texts = st.text_area("批量文本(每行一条)", "I loved it\nNot good")
if st.button("批量预测"):
    texts_list = [line.strip() for line in batch_texts.split("\n") if line.strip()]
    response = requests.post(
        "http://127.0.0.1:8000/predict_batch",
        json={"texts": texts_list}
    )
    results = response.json()
    for r in results:
        st.write(f"{r['text']} → {r['label']} ({r['confidence']:.2f})")
|
test.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# test.py — fire several concurrent requests at /predict to check that the
# service handles parallel load.
import asyncio
import aiohttp


async def main():
    """POST 5 requests concurrently and print each JSON response."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i in range(5):  # 5 concurrent requests
            tasks.append(session.post(
                "http://localhost:8000/predict",
                json={"text": f"This is test {i}"}
            ))
        responses = await asyncio.gather(*tasks)
        for r in responses:
            print(await r.json())


if __name__ == "__main__":
    # Guarded so importing this module does not fire network requests
    # (previously asyncio.run(main()) ran at import time).
    asyncio.run(main())
|
train.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from torch.utils.data import DataLoader
|
| 4 |
+
from transformers import BertTokenizer
|
| 5 |
+
import torch.optim as optim
|
| 6 |
+
from sklearn.metrics import accuracy_score
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
from data.imdb_data import IMDBDataset
|
| 10 |
+
from model.sentiment_model import SentimentAnalysisModel
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def evaluate(model, dataloader, device):
    """Run the model over a dataloader; print and return accuracy."""
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            gold = batch["labels"].to(device)

            logits = model(ids, mask)
            guesses = torch.argmax(logits, dim=1)

            all_preds += guesses.cpu().tolist()
            all_labels += gold.cpu().tolist()

    acc = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {acc:.4f}")
    return acc
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def train():
    """Fine-tune BERT on IMDB and save the weights to disk."""
    # ================== hyperparameters ==================
    model_name = "bert-base-uncased"
    batch_size = 8
    max_length = 256
    lr = 2e-5
    epochs = 3
    # =====================================================

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    tokenizer = BertTokenizer.from_pretrained(model_name)

    train_dataset = IMDBDataset("train", tokenizer, max_length)
    test_dataset = IMDBDataset("test", tokenizer, max_length)

    # Shared loader settings; only shuffling differs between splits.
    loader_kwargs = dict(batch_size=batch_size, num_workers=4, pin_memory=True)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    model = SentimentAnalysisModel(model_name).to(device)
    print("Model device:", next(model.parameters()).device)

    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # ================== training loop ==================
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
        for step, batch in enumerate(loop):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            logits = model(ids, mask)
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch + 1} Training Loss: {avg_loss:.4f}")

        # Evaluate on the held-out split after every epoch.
        evaluate(model, test_loader, device)

    # ================== save weights ==================
    torch.save(model.state_dict(), "bert_imdb_sentiment.pth")
    print("Model saved.")


if __name__ == "__main__":
    train()
|