EfektMotyla committed on
Commit
43c5e22
·
verified ·
1 Parent(s): 15f362b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -49
app.py CHANGED
@@ -1,68 +1,113 @@
 
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from typing import List
 
4
  import torch
5
  from transformers import (
6
  AutoTokenizer,
7
  AutoModelForTokenClassification,
8
  AutoModelForSequenceClassification,
9
- pipeline
10
  )
11
- from transformers import MarianMTModel, MarianTokenizer
 
12
 
13
  # ────────────────────── konfiguracja ──────────────────────
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
 
16
- # Lokalne modele
17
- aspect_tokenizer = AutoTokenizer.from_pretrained("bert-aspect-ner", local_files_only=True, use_fast=False)
18
- aspect_model = AutoModelForTokenClassification.from_pretrained("bert-aspect-ner", local_files_only=True).to(device)
19
- aspect_model.eval()
20
 
21
- sentiment_tokenizer = AutoTokenizer.from_pretrained("absa-roberta", local_files_only=True)
22
- sentiment_model = AutoModelForSequenceClassification.from_pretrained("absa-roberta", local_files_only=True).to(device)
23
- sentiment_model.eval()
24
 
25
- # TΕ‚umaczenia
26
 
27
- pl_to_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-pl-en", use_auth_token=True)
28
- pl_to_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-pl-en").to(device)
29
- pl_to_en = pipeline("translation", model=pl_to_en_model, tokenizer=pl_to_en_tokenizer, device=0 if torch.cuda.is_available() else -1)
 
 
 
 
 
 
30
 
31
- en_to_pl_tokenizer = MarianTokenizer.from_pretrained("gsarti/opus-mt-tc-en-pl", use_auth_token=True)
32
- en_to_pl_model = MarianMTModel.from_pretrained("gsarti/opus-mt-tc-en-pl").to(device)
33
- en_to_pl = pipeline("translation", model=en_to_pl_model, tokenizer=en_to_pl_tokenizer, device=0 if torch.cuda.is_available() else -1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # Alias sΕ‚ownik
36
- aspect_aliases = {
37
- "food": "jedzenie", "service": "obsΕ‚uga", "price": "cena",
38
- "taste": "smak", "waiter": "obsΕ‚uga", "dish": "danie",
39
- "portion": "porcja", "staff": "obsΕ‚uga", "decor": "wystrΓ³j",
40
- "menu": "menu", "drink": "napoje", "location": "lokalizacja",
41
- "time": "czas oczekiwania", "cleanliness": "czystoΕ›Δ‡", "smell": "zapach",
42
- "value": "cena", "experience": "doΕ›wiadczenie", "recommendation": "ogΓ³lna ocena",
43
- "children": "dzieci", "family": "rodzina", "pet": "zwierzΔ™ta"
44
- }
45
 
46
- # ────────────────────── Pydantic ──────────────────────
 
 
47
  class Comment(BaseModel):
48
  text: str
49
 
 
50
  class AspectSentiment(BaseModel):
51
  aspect: str
52
  sentiment: str
53
 
 
54
  class AnalysisResult(BaseModel):
55
  results: List[AspectSentiment]
56
 
57
- # ────────────────────── logika ──────────────────────
 
 
 
 
 
 
 
 
 
 
58
  def translate_pl_to_en(texts: list[str]) -> list[str]:
59
- return [r['translation_text'] for r in pl_to_en(texts)]
 
 
 
 
 
 
 
60
 
61
  def translate_en_to_pl(texts: list[str]) -> list[str]:
62
- return [r['translation_text'] for r in en_to_pl(texts)]
 
 
 
 
 
 
 
63
 
64
  def extract_aspects(text_en: str):
65
- inputs = aspect_tokenizer(text_en, return_tensors="pt", truncation=True, padding=True).to(device)
 
 
66
  with torch.no_grad():
67
  outputs = aspect_model(**inputs)
68
 
@@ -85,32 +130,37 @@ def extract_aspects(text_en: str):
85
  if current_tokens:
86
  aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
87
 
88
- return list({tok.replace(" ##", "").strip() for tok in aspects})
 
 
89
 
90
- # ────────────────────── FastAPI ──────────────────────
91
  app = FastAPI()
92
 
 
93
  @app.post("/analyze", response_model=AnalysisResult)
94
  def analyze_comment(comment: Comment):
95
  text_pl = comment.text
96
  text_en = translate_pl_to_en([text_pl])[0]
97
- aspects_en = extract_aspects(text_en)
98
 
99
- results = []
100
- seen = set()
101
- for asp in aspects_en:
102
  input_text = f"{text_en} [SEP] {asp}"
103
- inputs = sentiment_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
 
 
104
  with torch.no_grad():
105
  logits = sentiment_model(**inputs).logits
106
- pred = int(torch.argmax(logits, dim=1).cpu())
107
- sentiment = ["negatywny", "neutralny", "pozytywny", "konfliktowy"][pred]
108
-
109
- asp_lower = asp.lower()
110
- asp_pl = aspect_aliases.get(asp_lower, translate_en_to_pl([asp])[0].lower())
111
-
112
- if asp_pl not in seen:
113
- seen.add(asp_pl)
114
- results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment))
115
-
116
- return {"results": results}
 
 
1
+ from pathlib import Path
2
+
3
  from fastapi import FastAPI
4
  from pydantic import BaseModel
5
  from typing import List
6
+ from transformers import MarianMTModel, MarianTokenizer
7
  import torch
8
  from transformers import (
9
  AutoTokenizer,
10
  AutoModelForTokenClassification,
11
  AutoModelForSequenceClassification,
12
+ pipeline,
13
  )
14
+ import os
15
+ from huggingface_hub import snapshot_download
16
 
17
# ────────────────────── configuration ──────────────────────
# Resolve the compute device once; every model load below reuses it.
# (Previously this was assigned twice — the duplicate is removed.)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Repository root: local fine-tuned models are bundled next to app.py.
ROOT = Path(__file__).parent

aspect_dir = ROOT / "bert-aspect-ner"
sentiment_dir = ROOT / "absa-roberta"

# Hugging Face access token for gated/private downloads; may be None
# when the HF_TOKEN environment variable is not set.
hf_token = os.getenv("HF_TOKEN")
28
# ────────────────────── local models ─────────────────────
# Aspect extractor: token-classification head over the bundled NER model.
aspect_tokenizer = AutoTokenizer.from_pretrained(
    str(aspect_dir), local_files_only=True, use_fast=False  # slow tokenizer: no tokenizer.json in the local dir
)
aspect_model = AutoModelForTokenClassification.from_pretrained(
    str(aspect_dir), local_files_only=True
).to(device)
# Explicit eval mode (inference only) — from_pretrained already returns
# eval mode, but this documents intent and matches the previous version.
aspect_model.eval()

# Aspect-based sentiment classifier over "<text> [SEP] <aspect>" pairs.
sentiment_tokenizer = AutoTokenizer.from_pretrained(
    str(sentiment_dir), local_files_only=True
)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    str(sentiment_dir), local_files_only=True
).to(device)
sentiment_model.eval()
42
+
43
# ────────────────────── translation models (online) ─────────
# Redirect the Hugging Face cache into /tmp so downloads work even when
# the app directory is read-only (e.g. on Spaces).
# NOTE(review): transformers was imported above, so it may have read
# these env vars already — the explicit cache_dir arguments below are
# what actually guarantee the cache location.
HF_CACHE_DIR = "/tmp/hf_cache"
os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.environ["HF_HOME"] = HF_CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR

# Download the MarianMT model snapshots (token enables gated/private
# repos; it may be None for public models).
pl_to_en_dir = snapshot_download(
    "Helsinki-NLP/opus-mt-pl-en", token=hf_token, cache_dir=HF_CACHE_DIR
)
en_to_pl_dir = snapshot_download(
    "gsarti/opus-mt-tc-en-pl", token=hf_token, cache_dir=HF_CACHE_DIR
)

# Load tokenizers and models from the downloaded snapshots.
pl_to_en_tok = MarianTokenizer.from_pretrained(pl_to_en_dir)
pl_to_en_mod = MarianMTModel.from_pretrained(pl_to_en_dir).to(device)

en_to_pl_tok = MarianTokenizer.from_pretrained(en_to_pl_dir)
en_to_pl_mod = MarianMTModel.from_pretrained(en_to_pl_dir).to(device)
63
+ # ────────────────────── schemy Pydantic ────────────────────
64
class Comment(BaseModel):
    """Request body: a single review comment in Polish."""

    # Raw comment text (Polish).
    text: str
66
 
67
+
68
class AspectSentiment(BaseModel):
    """One detected aspect paired with its predicted sentiment."""

    # Aspect name in Polish (alias-mapped or machine-translated).
    aspect: str
    # One of "negatywny", "neutralny", "pozytywny", "konfliktowy".
    sentiment: str
71
 
72
+
73
class AnalysisResult(BaseModel):
    """Response body: the list of aspect/sentiment pairs for one comment."""

    results: List[AspectSentiment]
75
 
76
# === EN→PL aspect alias dictionary (same as before) ===
# Common restaurant-review aspects are mapped straight to Polish labels,
# skipping the MT model; several English keys intentionally collapse to
# one Polish label (e.g. service/waiter/staff → "obsługa").
aspect_aliases = {
    "food": "jedzenie", "service": "obsługa", "price": "cena",
    "taste": "smak", "waiter": "obsługa", "dish": "danie",
    "portion": "porcja", "staff": "obsługa", "decor": "wystrój",
    "menu": "menu", "drink": "napoje", "location": "lokalizacja",
    "time": "czas oczekiwania", "cleanliness": "czystość", "smell": "zapach",
    "value": "cena", "experience": "doświadczenie", "recommendation": "ogólna ocena",
    "children": "dzieci", "family": "rodzina", "pet": "zwierzęta"
}
86
# ───────────────────── translation helpers ──────────────────────
def translate_pl_to_en(texts: list[str]) -> list[str]:
    """Translate a batch of Polish texts to English via the Marian model."""
    batch = pl_to_en_tok(
        texts, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        output_ids = pl_to_en_mod.generate(**batch)
    return pl_to_en_tok.batch_decode(output_ids, skip_special_tokens=True)
95
+
96
 
97
def translate_en_to_pl(texts: list[str]) -> list[str]:
    """Translate a batch of English texts to Polish via the Marian model."""
    batch = en_to_pl_tok(
        texts, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        output_ids = en_to_pl_mod.generate(**batch)
    return en_to_pl_tok.batch_decode(output_ids, skip_special_tokens=True)
105
+
106
 
107
  def extract_aspects(text_en: str):
108
+ inputs = aspect_tokenizer(
109
+ text_en, return_tensors="pt", truncation=True, padding=True
110
+ ).to(device)
111
  with torch.no_grad():
112
  outputs = aspect_model(**inputs)
113
 
 
130
  if current_tokens:
131
  aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
132
 
133
+ # ↓ usuΕ„ spacje z β€ž##” i zduplikowane wyniki
134
+ return list({tok.replace(" ##", "") for tok in aspects})
135
+
136
 
137
# ────────────────────── FastAPI ────────────────────────────
# Application instance; exposes the POST /analyze endpoint below.
app = FastAPI()
139
 
140
+
141
@app.post("/analyze", response_model=AnalysisResult)
def analyze_comment(comment: Comment):
    """Analyze a Polish comment end-to-end.

    Pipeline: translate PL→EN, extract aspect spans, classify sentiment
    for each aspect, then map the aspect name back to Polish.
    Returns a dict matching AnalysisResult.
    """
    text_pl = comment.text
    text_en = translate_pl_to_en([text_pl])[0]
    aspects = extract_aspects(text_en)

    results: list[AspectSentiment] = []
    seen: set[str] = set()  # dedupe aspects that map to the same Polish label
    for asp in aspects:
        # ABSA input convention: "<sentence> [SEP] <aspect>".
        input_text = f"{text_en} [SEP] {asp}"
        inputs = sentiment_tokenizer(
            input_text, return_tensors="pt", truncation=True, padding=True
        ).to(device)
        with torch.no_grad():
            logits = sentiment_model(**inputs).logits
        predicted_class_id = int(logits.argmax().cpu())
        sentiment_label = {
            0: "negatywny",
            1: "neutralny",
            2: "pozytywny",
            3: "konfliktowy",
        }[predicted_class_id]

        # Lowercase before the alias lookup so capitalized NER spans still
        # hit the dictionary; fall back to machine translation otherwise.
        asp_pl = aspect_aliases.get(asp.lower(), translate_en_to_pl([asp])[0].lower())
        # Skip duplicates: several English aspects (e.g. "waiter", "staff")
        # collapse to the same Polish label.
        if asp_pl not in seen:
            seen.add(asp_pl)
            results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment_label))

    return {"results": results}