from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    pipeline
)
from transformers import MarianMTModel, MarianTokenizer
# ────────────────────── configuration ──────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
# Local fine-tuned models (loaded from local files only)
aspect_tokenizer = AutoTokenizer.from_pretrained("bert-aspect-ner", local_files_only=True, use_fast=False)
aspect_model = AutoModelForTokenClassification.from_pretrained("bert-aspect-ner", local_files_only=True).to(device)
aspect_model.eval()
sentiment_tokenizer = AutoTokenizer.from_pretrained("absa-roberta", local_files_only=True)
sentiment_model = AutoModelForSequenceClassification.from_pretrained("absa-roberta", local_files_only=True).to(device)
sentiment_model.eval()
# Translation models (Polish ↔ English)
pl_to_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-pl-en")
pl_to_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-pl-en").to(device)
pl_to_en = pipeline("translation", model=pl_to_en_model, tokenizer=pl_to_en_tokenizer, device=0 if torch.cuda.is_available() else -1)
en_to_pl_tokenizer = MarianTokenizer.from_pretrained("gsarti/opus-mt-tc-en-pl")
en_to_pl_model = MarianMTModel.from_pretrained("gsarti/opus-mt-tc-en-pl").to(device)
en_to_pl = pipeline("translation", model=en_to_pl_model, tokenizer=en_to_pl_tokenizer, device=0 if torch.cuda.is_available() else -1)
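# Illustrative only: a Hugging Face translation pipeline returns a list of dicts, e.g.
#   pl_to_en(["Obsługa była bardzo miła."])
#     → [{"translation_text": "The service was very nice."}]
# (the exact wording depends on the Opus-MT checkpoint).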
# Alias dictionary: common English aspect terms mapped to Polish labels
aspect_aliases = {
"food": "jedzenie", "service": "obsΕuga", "price": "cena",
"taste": "smak", "waiter": "obsΕuga", "dish": "danie",
"portion": "porcja", "staff": "obsΕuga", "decor": "wystrΓ³j",
"menu": "menu", "drink": "napoje", "location": "lokalizacja",
"time": "czas oczekiwania", "cleanliness": "czystoΕΔ", "smell": "zapach",
"value": "cena", "experience": "doΕwiadczenie", "recommendation": "ogΓ³lna ocena",
"children": "dzieci", "family": "rodzina", "pet": "zwierzΔta"
}
# ────────────────────── Pydantic ──────────────────────
class Comment(BaseModel):
    text: str

class AspectSentiment(BaseModel):
    aspect: str
    sentiment: str

class AnalysisResult(BaseModel):
    results: List[AspectSentiment]
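# Illustrative request/response shape for the endpoint defined below
# (the aspects and sentiments shown are made-up example outputs):
#   POST /analyze  {"text": "Pizza była pyszna, ale na stolik czekaliśmy godzinę."}
#   → {"results": [{"aspect": "jedzenie", "sentiment": "pozytywny"},
#                  {"aspect": "czas oczekiwania", "sentiment": "negatywny"}]}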
# ────────────────────── logic ──────────────────────
def translate_pl_to_en(texts: list[str]) -> list[str]:
    return [r['translation_text'] for r in pl_to_en(texts)]

def translate_en_to_pl(texts: list[str]) -> list[str]:
    return [r['translation_text'] for r in en_to_pl(texts)]
def extract_aspects(text_en: str):
    # Token-level BIO tagging: the NER model marks aspect spans with B-ASP / I-ASP.
    inputs = aspect_tokenizer(text_en, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = aspect_model(**inputs)
    preds = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    tokens = aspect_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [aspect_model.config.id2label[int(p)] for p in preds]
    # Merge consecutive B-ASP / I-ASP tokens into aspect phrases.
    aspects, current_tokens = [], []
    for token, label in zip(tokens, labels):
        if label == "B-ASP":
            if current_tokens:
                aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
            current_tokens = [token]
        elif label == "I-ASP" and current_tokens:
            current_tokens.append(token)
        else:
            if current_tokens:
                aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
            current_tokens = []
    if current_tokens:
        aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
    # Deduplicate and strip leftover WordPiece markers.
    return list({tok.replace(" ##", "").strip() for tok in aspects})
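# Example of the BIO decoding above (illustrative; actual tags depend on the
# fine-tuned bert-aspect-ner checkpoint):
#   tokens: ["[CLS]", "the", "pizza", "was", "cold", "[SEP]"]
#   labels: ["O",     "O",   "B-ASP", "O",   "O",    "O"]
#   → returns ["pizza"]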
# ────────────────────── FastAPI ──────────────────────
app = FastAPI()
@app.post("/analyze", response_model=AnalysisResult)
def analyze_comment(comment: Comment):
    text_pl = comment.text
    # Translate the Polish review to English for the English-only ABSA models.
    text_en = translate_pl_to_en([text_pl])[0]
    aspects_en = extract_aspects(text_en)
    results = []
    seen = set()
    for asp in aspects_en:
        # Sentence-pair classification: "<review> [SEP] <aspect>".
        input_text = f"{text_en} [SEP] {asp}"
        inputs = sentiment_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = sentiment_model(**inputs).logits
        pred = int(torch.argmax(logits, dim=1).cpu())
        sentiment = ["negatywny", "neutralny", "pozytywny", "konfliktowy"][pred]
        # Map the English aspect back to a Polish label (alias first, then MT fallback).
        asp_lower = asp.lower()
        asp_pl = aspect_aliases.get(asp_lower, translate_en_to_pl([asp])[0].lower())
        if asp_pl not in seen:
            seen.add(asp_pl)
            results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment))
    return {"results": results}
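# ────────────────────── local run (sketch) ──────────────────────
# A minimal way to exercise the endpoint without deploying the Space; assumes
# fastapi's TestClient (and its httpx dependency) is available. The sample
# review and the printed result are illustrative, not guaranteed outputs.
if __name__ == "__main__":
    from fastapi.testclient import TestClient

    client = TestClient(app)
    resp = client.post("/analyze", json={"text": "Jedzenie było świetne, ale obsługa bardzo wolna."})
    print(resp.json())  # e.g. {"results": [{"aspect": "jedzenie", "sentiment": "pozytywny"}, ...]}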