from pathlib import Path
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    pipeline,
)
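# Aspect-based sentiment analysis (ABSA) service: a Polish review is translated
# to English, aspect terms are extracted with a token-classification model,
# each aspect is scored by a sequence-classification model, and the results
# are mapped back to Polish labels.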
# ────────────────────── configuration ──────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
ROOT = Path(__file__).parent
MODELS_DIR = ROOT / "models"
aspect_dir = MODELS_DIR / "bert-aspect-ner"
sentiment_dir = MODELS_DIR / "absa-roberta"
# ────────────────────── local models ───────────────────────
aspect_tokenizer = AutoTokenizer.from_pretrained(
    str(aspect_dir), local_files_only=True, use_fast=False  # ← in case tokenizer.json is missing
)
aspect_model = AutoModelForTokenClassification.from_pretrained(
    str(aspect_dir), local_files_only=True
).to(device)
sentiment_tokenizer = AutoTokenizer.from_pretrained(
    str(sentiment_dir), local_files_only=True
)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    str(sentiment_dir), local_files_only=True
).to(device)
# ────────────────────── translation models (online) ────────
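# Unlike the local ABSA models above, these pipelines fetch their weights
# from the Hugging Face Hub on first use.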
pl_to_en = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-pl-en",
    device=0 if device == "cuda" else -1,
)
en_to_pl = pipeline(
    "translation",
    model="gsarti/opus-mt-tc-en-pl",
    device=0 if device == "cuda" else -1,
)
# ────────────────────── Pydantic schemas ───────────────────
class Comment(BaseModel):
    text: str

class AspectSentiment(BaseModel):
    aspect: str
    sentiment: str

class AnalysisResult(BaseModel):
    results: List[AspectSentiment]
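# Request/response shapes implied by the schemas above (illustrative values):
#   request:  {"text": "Jedzenie było pyszne, ale obsługa wolna."}
#   response: {"results": [{"aspect": "jedzenie", "sentiment": "pozytywny"}]}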
# === EN→PL aspect alias dictionary (same as before) ===
aspect_aliases = {
    "food": "jedzenie", "service": "obsługa", "price": "cena",
    "taste": "smak", "waiter": "obsługa", "dish": "danie",
    "portion": "porcja", "staff": "obsługa", "decor": "wystrój",
    "menu": "menu", "drink": "napoje", "location": "lokalizacja",
    "time": "czas oczekiwania", "cleanliness": "czystość", "smell": "zapach",
    "value": "cena", "experience": "doświadczenie", "recommendation": "ogólna ocena",
    "children": "dzieci", "family": "rodzina", "pet": "zwierzęta"
}
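# Aspects missing from this dictionary fall back to machine translation
# in analyze_comment below.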
def translate_pl_to_en(texts):
    return [res["translation_text"] for res in pl_to_en(texts)]

def translate_en_to_pl(texts):
    return [res["translation_text"] for res in en_to_pl(texts)]
def extract_aspects(text_en: str):
    # Tokenize and run the token-classification model over the English text.
    inputs = aspect_tokenizer(
        text_en, return_tensors="pt", truncation=True, padding=True
    ).to(device)
    with torch.no_grad():
        outputs = aspect_model(**inputs)
    preds = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    tokens = aspect_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [aspect_model.config.id2label[p] for p in preds]
    # Greedy BIO decoding: B-ASP opens a new span, I-ASP extends it,
    # anything else (including special tokens) closes the current span.
    aspects, current_tokens = [], []
    for token, label in zip(tokens, labels):
        if label == "B-ASP":
            if current_tokens:
                aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
            current_tokens = [token]
        elif label == "I-ASP" and current_tokens:
            current_tokens.append(token)
        else:
            if current_tokens:
                aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
            current_tokens = []
    if current_tokens:
        aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
    # ← strip WordPiece " ##" joiners and deduplicate results
    return list({tok.replace(" ##", "") for tok in aspects})
# ────────────────────── FastAPI ────────────────────────────
app = FastAPI()
@app.post("/analyze", response_model=AnalysisResult)
def analyze_comment(comment: Comment):
    text_pl = comment.text
    # Translate the Polish comment to English for the English-trained models.
    text_en = translate_pl_to_en([text_pl])[0]
    aspects = extract_aspects(text_en)
    results: List[AspectSentiment] = []
    for asp in aspects:
        # Sentence-pair ABSA input: the review and the aspect joined by [SEP].
        input_text = f"{text_en} [SEP] {asp}"
        inputs = sentiment_tokenizer(
            input_text, return_tensors="pt", truncation=True, padding=True
        ).to(device)
        with torch.no_grad():
            logits = sentiment_model(**inputs).logits
        predicted_class_id = int(logits.argmax().cpu())
        # Four-class scheme of the fine-tuned checkpoint:
        # negative / neutral / positive / conflict.
        sentiment_label = {
            0: "negatywny",
            1: "neutralny",
            2: "pozytywny",
            3: "konfliktowy",
        }[predicted_class_id]
        # Map the English aspect to its Polish alias, falling back to machine translation.
        asp_pl = aspect_aliases.get(asp, translate_en_to_pl([asp])[0].lower())
        results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment_label))
    return {"results": results}
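# Example request against a running instance (illustrative):
#   curl -X POST http://localhost:7860/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Jedzenie było pyszne, ale obsługa bardzo wolna."}'
# Local launch sketch (assumes uvicorn is installed; port 7860 is the usual
# Spaces convention, not something this file pins down):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)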