Spaces:

EfektMotyla
/

ABSA-REST-API

Sleeping

App Files Community

EfektMotyla committed on May 18, 2025

Commit

43c5e22

verified ·

1 Parent(s): 15f362b

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -49

app.py CHANGED Viewed

@@ -1,68 +1,113 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 from typing import List
 import torch
 from transformers import (
     AutoTokenizer,
     AutoModelForTokenClassification,
     AutoModelForSequenceClassification,
-    pipeline
 )
-from transformers import MarianMTModel, MarianTokenizer
 # ────────────────────── konfiguracja ──────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Lokalne modele
-aspect_tokenizer = AutoTokenizer.from_pretrained("bert-aspect-ner", local_files_only=True, use_fast=False)
-aspect_model = AutoModelForTokenClassification.from_pretrained("bert-aspect-ner", local_files_only=True).to(device)
-aspect_model.eval()
-sentiment_tokenizer = AutoTokenizer.from_pretrained("absa-roberta", local_files_only=True)
-sentiment_model = AutoModelForSequenceClassification.from_pretrained("absa-roberta", local_files_only=True).to(device)
-sentiment_model.eval()
-# Tłumaczenia
-pl_to_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-pl-en",  use_auth_token=True)
-pl_to_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-pl-en").to(device)
-pl_to_en = pipeline("translation", model=pl_to_en_model, tokenizer=pl_to_en_tokenizer, device=0 if torch.cuda.is_available() else -1)
-en_to_pl_tokenizer = MarianTokenizer.from_pretrained("gsarti/opus-mt-tc-en-pl",  use_auth_token=True)
-en_to_pl_model = MarianMTModel.from_pretrained("gsarti/opus-mt-tc-en-pl").to(device)
-en_to_pl = pipeline("translation", model=en_to_pl_model, tokenizer=en_to_pl_tokenizer, device=0 if torch.cuda.is_available() else -1)
-# Alias słownik
-aspect_aliases = {
-    "food": "jedzenie", "service": "obsługa", "price": "cena",
-    "taste": "smak", "waiter": "obsługa", "dish": "danie",
-    "portion": "porcja", "staff": "obsługa", "decor": "wystrój",
-    "menu": "menu", "drink": "napoje", "location": "lokalizacja",
-    "time": "czas oczekiwania", "cleanliness": "czystość", "smell": "zapach",
-    "value": "cena", "experience": "doświadczenie", "recommendation": "ogólna ocena",
-    "children": "dzieci", "family": "rodzina", "pet": "zwierzęta"
-}
-# ────────────────────── Pydantic ──────────────────────
 class Comment(BaseModel):
     text: str
 class AspectSentiment(BaseModel):
     aspect: str
     sentiment: str
 class AnalysisResult(BaseModel):
     results: List[AspectSentiment]
-# ────────────────────── logika ──────────────────────
 def translate_pl_to_en(texts: list[str]) -> list[str]:
-    return [r['translation_text'] for r in pl_to_en(texts)]
 def translate_en_to_pl(texts: list[str]) -> list[str]:
-    return [r['translation_text'] for r in en_to_pl(texts)]
 def extract_aspects(text_en: str):
-    inputs = aspect_tokenizer(text_en, return_tensors="pt", truncation=True, padding=True).to(device)
     with torch.no_grad():
         outputs = aspect_model(**inputs)
@@ -85,32 +130,37 @@ def extract_aspects(text_en: str):
     if current_tokens:
         aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
-    return list({tok.replace(" ##", "").strip() for tok in aspects})
-# ────────────────────── FastAPI ──────────────────────
 app = FastAPI()
 @app.post("/analyze", response_model=AnalysisResult)
 def analyze_comment(comment: Comment):
     text_pl = comment.text
     text_en = translate_pl_to_en([text_pl])[0]
-    aspects_en = extract_aspects(text_en)
-    results = []
-    seen = set()
-    for asp in aspects_en:
         input_text = f"{text_en} [SEP] {asp}"
-        inputs = sentiment_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
         with torch.no_grad():
             logits = sentiment_model(**inputs).logits
-            pred = int(torch.argmax(logits, dim=1).cpu())
-        sentiment = ["negatywny", "neutralny", "pozytywny", "konfliktowy"][pred]
-        asp_lower = asp.lower()
-        asp_pl = aspect_aliases.get(asp_lower, translate_en_to_pl([asp])[0].lower())
-        if asp_pl not in seen:
-            seen.add(asp_pl)
-            results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment))
-    return {"results": results}

+from pathlib import Path
 from fastapi import FastAPI
 from pydantic import BaseModel
 from typing import List
+from transformers import MarianMTModel, MarianTokenizer
 import torch
 from transformers import (
     AutoTokenizer,
     AutoModelForTokenClassification,
     AutoModelForSequenceClassification,
+    pipeline,
 )
+import os
+from huggingface_hub import snapshot_download
 # ────────────────────── konfiguracja ──────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
+ROOT = Path(__file__).parent
+aspect_dir = ROOT / "bert-aspect-ner"
+sentiment_dir = ROOT / "absa-roberta"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+hf_token = os.getenv("HF_TOKEN")
+# ────────────────────── modele lokalne ─────────────────────
+aspect_tokenizer = AutoTokenizer.from_pretrained(
+    str(aspect_dir), local_files_only=True, use_fast=False        # ← jeśli brak tokenizer.json
+)
+aspect_model = AutoModelForTokenClassification.from_pretrained(
+    str(aspect_dir), local_files_only=True
+).to(device)
+sentiment_tokenizer = AutoTokenizer.from_pretrained(
+    str(sentiment_dir), local_files_only=True
+)
+sentiment_model = AutoModelForSequenceClassification.from_pretrained(
+    str(sentiment_dir), local_files_only=True
+).to(device)
+# ────────────────────── modele tłumaczeń (on-line) ─────────
+HF_CACHE_DIR = "/tmp/hf_cache"
+os.makedirs(HF_CACHE_DIR, exist_ok=True)
+os.environ["HF_HOME"] = HF_CACHE_DIR
+os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
+#  Pobieramy modele
+pl_to_en_dir = snapshot_download(
+    "Helsinki-NLP/opus-mt-pl-en", token=hf_token, cache_dir=HF_CACHE_DIR
+)
+en_to_pl_dir = snapshot_download(
+    "gsarti/opus-mt-tc-en-pl", token=hf_token, cache_dir=HF_CACHE_DIR
+)
+# Ładujemy
+pl_to_en_tok = MarianTokenizer.from_pretrained(pl_to_en_dir)
+pl_to_en_mod = MarianMTModel.from_pretrained(pl_to_en_dir).to(device)
+en_to_pl_tok = MarianTokenizer.from_pretrained(en_to_pl_dir)
+en_to_pl_mod = MarianMTModel.from_pretrained(en_to_pl_dir).to(device)
+# ────────────────────── schemy Pydantic ────────────────────
 class Comment(BaseModel):
     text: str
 class AspectSentiment(BaseModel):
     aspect: str
     sentiment: str
 class AnalysisResult(BaseModel):
     results: List[AspectSentiment]
+# === Słownik aliasów aspektów EN→PL (taki sam jak wcześniej) ===
+aspect_aliases = {
+    "food": "jedzenie", "service": "obsługa", "price": "cena",
+    "taste": "smak", "waiter": "obsługa", "dish": "danie",
+    "portion": "porcja", "staff": "obsługa", "decor": "wystrój",
+    "menu": "menu", "drink": "napoje", "location": "lokalizacja",
+    "time": "czas oczekiwania", "cleanliness": "czystość", "smell": "zapach",
+    "value": "cena", "experience": "doświadczenie", "recommendation": "ogólna ocena",
+    "children": "dzieci", "family": "rodzina", "pet": "zwierzęta"
+}
+# ───────────────────── tłumaczenia  ──────────────────────
 def translate_pl_to_en(texts: list[str]) -> list[str]:
+    inputs = pl_to_en_tok(texts,
+                          return_tensors="pt",
+                          padding=True,
+                          truncation=True).to(device)
+    with torch.no_grad():
+        generated = pl_to_en_mod.generate(**inputs)
+    return pl_to_en_tok.batch_decode(generated, skip_special_tokens=True)
 def translate_en_to_pl(texts: list[str]) -> list[str]:
+    inputs = en_to_pl_tok(texts,
+                          return_tensors="pt",
+                          padding=True,
+                          truncation=True).to(device)
+    with torch.no_grad():
+        generated = en_to_pl_mod.generate(**inputs)
+    return en_to_pl_tok.batch_decode(generated, skip_special_tokens=True)
 def extract_aspects(text_en: str):
+    inputs = aspect_tokenizer(
+        text_en, return_tensors="pt", truncation=True, padding=True
+    ).to(device)
     with torch.no_grad():
         outputs = aspect_model(**inputs)
     if current_tokens:
         aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
+    # ↓ usuń spacje z „##” i zduplikowane wyniki
+    return list({tok.replace(" ##", "") for tok in aspects})
+# ────────────────────── FastAPI ────────────────────────────
 app = FastAPI()
 @app.post("/analyze", response_model=AnalysisResult)
 def analyze_comment(comment: Comment):
     text_pl = comment.text
     text_en = translate_pl_to_en([text_pl])[0]
+    aspects = extract_aspects(text_en)
+    results: list[AspectSentiment] = []
+    for asp in aspects:
         input_text = f"{text_en} [SEP] {asp}"
+        inputs = sentiment_tokenizer(
+            input_text, return_tensors="pt", truncation=True, padding=True
+        ).to(device)
         with torch.no_grad():
             logits = sentiment_model(**inputs).logits
+            predicted_class_id = int(logits.argmax().cpu())
+            sentiment_label = {
+                0: "negatywny",
+                1: "neutralny",
+                2: "pozytywny",
+                3: "konfliktowy",
+            }[predicted_class_id]
+        asp_pl = aspect_aliases.get(asp, translate_en_to_pl([asp])[0].lower())
+        results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment_label))
+    return {"results": results}