EfektMotyla committed on
Commit
7ae4f2b
·
verified ·
1 Parent(s): 94eceb2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -101
app.py CHANGED
@@ -1,113 +1,61 @@
1
- from pathlib import Path
2
-
3
  from fastapi import FastAPI
4
  from pydantic import BaseModel
5
  from typing import List
6
- from transformers import MarianMTModel, MarianTokenizer
7
  import torch
8
  from transformers import (
9
  AutoTokenizer,
10
  AutoModelForTokenClassification,
11
  AutoModelForSequenceClassification,
12
- pipeline,
13
  )
14
- import os
15
- from huggingface_hub import snapshot_download
16
 
17
  # ────────────────────── konfiguracja ──────────────────────
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
 
20
- ROOT = Path(__file__).parent
21
-
22
- aspect_dir = ROOT / "bert-aspect-ner"
23
- sentiment_dir = ROOT / "absa-roberta"
24
-
25
 
26
- device = "cuda" if torch.cuda.is_available() else "cpu"
27
- hf_token = os.getenv("HF_TOKEN")
28
- # ────────────────────── modele lokalne ─────────────────────
29
- aspect_tokenizer = AutoTokenizer.from_pretrained(
30
- str(aspect_dir), local_files_only=True, use_fast=False # ← jeśli brak tokenizer.json
31
- )
32
- aspect_model = AutoModelForTokenClassification.from_pretrained(
33
- str(aspect_dir), local_files_only=True
34
- ).to(device)
35
 
36
- sentiment_tokenizer = AutoTokenizer.from_pretrained(
37
- str(sentiment_dir), local_files_only=True
38
- )
39
- sentiment_model = AutoModelForSequenceClassification.from_pretrained(
40
- str(sentiment_dir), local_files_only=True
41
- ).to(device)
42
-
43
- # ────────────────────── modele tłumaczeń (on-line) ─────────
44
- HF_CACHE_DIR = "/tmp/hf_cache"
45
- os.makedirs(HF_CACHE_DIR, exist_ok=True)
46
- os.environ["HF_HOME"] = HF_CACHE_DIR
47
- os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
48
-
49
- # Pobieramy modele
50
- pl_to_en_dir = snapshot_download(
51
- "Helsinki-NLP/opus-mt-pl-en", token=hf_token, cache_dir=HF_CACHE_DIR
52
- )
53
- en_to_pl_dir = snapshot_download(
54
- "gsarti/opus-mt-tc-en-pl", token=hf_token, cache_dir=HF_CACHE_DIR
55
- )
56
 
57
- # Ładujemy
58
- pl_to_en_tok = MarianTokenizer.from_pretrained(pl_to_en_dir)
59
- pl_to_en_mod = MarianMTModel.from_pretrained(pl_to_en_dir).to(device)
 
 
 
 
 
 
 
60
 
61
- en_to_pl_tok = MarianTokenizer.from_pretrained(en_to_pl_dir)
62
- en_to_pl_mod = MarianMTModel.from_pretrained(en_to_pl_dir).to(device)
63
- # ────────────────────── schemy Pydantic ────────────────────
64
  class Comment(BaseModel):
65
  text: str
66
 
67
-
68
  class AspectSentiment(BaseModel):
69
  aspect: str
70
  sentiment: str
71
 
72
-
73
  class AnalysisResult(BaseModel):
74
  results: List[AspectSentiment]
75
 
76
- # === Słownik aliasów aspektów EN→PL (taki sam jak wcześniej) ===
77
- aspect_aliases = {
78
- "food": "jedzenie", "service": "obsługa", "price": "cena",
79
- "taste": "smak", "waiter": "obsługa", "dish": "danie",
80
- "portion": "porcja", "staff": "obsługa", "decor": "wystrój",
81
- "menu": "menu", "drink": "napoje", "location": "lokalizacja",
82
- "time": "czas oczekiwania", "cleanliness": "czystość", "smell": "zapach",
83
- "value": "cena", "experience": "doświadczenie", "recommendation": "ogólna ocena",
84
- "children": "dzieci", "family": "rodzina", "pet": "zwierzęta"
85
- }
86
- # ───────────────────── tłumaczenia ──────────────────────
87
  def translate_pl_to_en(texts: list[str]) -> list[str]:
88
- inputs = pl_to_en_tok(texts,
89
- return_tensors="pt",
90
- padding=True,
91
- truncation=True).to(device)
92
- with torch.no_grad():
93
- generated = pl_to_en_mod.generate(**inputs)
94
- return pl_to_en_tok.batch_decode(generated, skip_special_tokens=True)
95
-
96
 
97
  def translate_en_to_pl(texts: list[str]) -> list[str]:
98
- inputs = en_to_pl_tok(texts,
99
- return_tensors="pt",
100
- padding=True,
101
- truncation=True).to(device)
102
- with torch.no_grad():
103
- generated = en_to_pl_mod.generate(**inputs)
104
- return en_to_pl_tok.batch_decode(generated, skip_special_tokens=True)
105
-
106
 
107
  def extract_aspects(text_en: str):
108
- inputs = aspect_tokenizer(
109
- text_en, return_tensors="pt", truncation=True, padding=True
110
- ).to(device)
111
  with torch.no_grad():
112
  outputs = aspect_model(**inputs)
113
 
@@ -130,37 +78,32 @@ def extract_aspects(text_en: str):
130
  if current_tokens:
131
  aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
132
 
133
- # ↓ usuń spacje z „##” i zduplikowane wyniki
134
- return list({tok.replace(" ##", "") for tok in aspects})
135
-
136
 
137
- # ────────────────────── FastAPI ────────────────────────────
138
  app = FastAPI()
139
 
140
-
141
  @app.post("/analyze", response_model=AnalysisResult)
142
  def analyze_comment(comment: Comment):
143
  text_pl = comment.text
144
  text_en = translate_pl_to_en([text_pl])[0]
145
- aspects = extract_aspects(text_en)
146
 
147
- results: list[AspectSentiment] = []
148
- for asp in aspects:
 
149
  input_text = f"{text_en} [SEP] {asp}"
150
- inputs = sentiment_tokenizer(
151
- input_text, return_tensors="pt", truncation=True, padding=True
152
- ).to(device)
153
  with torch.no_grad():
154
  logits = sentiment_model(**inputs).logits
155
- predicted_class_id = int(logits.argmax().cpu())
156
- sentiment_label = {
157
- 0: "negatywny",
158
- 1: "neutralny",
159
- 2: "pozytywny",
160
- 3: "konfliktowy",
161
- }[predicted_class_id]
162
-
163
- asp_pl = aspect_aliases.get(asp, translate_en_to_pl([asp])[0].lower())
164
- results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment_label))
165
-
166
- return {"results": results}
 
 
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from typing import List
 
4
  import torch
5
  from transformers import (
6
  AutoTokenizer,
7
  AutoModelForTokenClassification,
8
  AutoModelForSequenceClassification,
9
+ pipeline
10
  )
 
 
11
 
12
  # ────────────────────── konfiguracja ──────────────────────
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
 
15
+ # Lokalne modele
16
+ aspect_tokenizer = AutoTokenizer.from_pretrained("bert-aspect-ner", local_files_only=True, use_fast=False)
17
+ aspect_model = AutoModelForTokenClassification.from_pretrained("bert-aspect-ner", local_files_only=True).to(device)
18
+ aspect_model.eval()
 
19
 
20
+ sentiment_tokenizer = AutoTokenizer.from_pretrained("absa-roberta", local_files_only=True)
21
+ sentiment_model = AutoModelForSequenceClassification.from_pretrained("absa-roberta", local_files_only=True).to(device)
22
+ sentiment_model.eval()
 
 
 
 
 
 
23
 
24
+ # Tłumaczenia
25
+ pl_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-pl-en", device=0 if torch.cuda.is_available() else -1)
26
+ en_to_pl = pipeline("translation", model="gsarti/opus-mt-tc-en-pl", device=0 if torch.cuda.is_available() else -1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ # Alias słownik
29
+ aspect_aliases = {
30
+ "food": "jedzenie", "service": "obsługa", "price": "cena",
31
+ "taste": "smak", "waiter": "obsługa", "dish": "danie",
32
+ "portion": "porcja", "staff": "obsługa", "decor": "wystrój",
33
+ "menu": "menu", "drink": "napoje", "location": "lokalizacja",
34
+ "time": "czas oczekiwania", "cleanliness": "czystość", "smell": "zapach",
35
+ "value": "cena", "experience": "doświadczenie", "recommendation": "ogólna ocena",
36
+ "children": "dzieci", "family": "rodzina", "pet": "zwierzęta"
37
+ }
38
 
39
+ # ────────────────────── Pydantic ──────────────────────
 
 
40
  class Comment(BaseModel):
41
  text: str
42
 
 
43
  class AspectSentiment(BaseModel):
44
  aspect: str
45
  sentiment: str
46
 
 
47
  class AnalysisResult(BaseModel):
48
  results: List[AspectSentiment]
49
 
50
+ # ────────────────────── logika ──────────────────────
 
 
 
 
 
 
 
 
 
 
51
  def translate_pl_to_en(texts: list[str]) -> list[str]:
52
+ return [r['translation_text'] for r in pl_to_en(texts)]
 
 
 
 
 
 
 
53
 
54
  def translate_en_to_pl(texts: list[str]) -> list[str]:
55
+ return [r['translation_text'] for r in en_to_pl(texts)]
 
 
 
 
 
 
 
56
 
57
  def extract_aspects(text_en: str):
58
+ inputs = aspect_tokenizer(text_en, return_tensors="pt", truncation=True, padding=True).to(device)
 
 
59
  with torch.no_grad():
60
  outputs = aspect_model(**inputs)
61
 
 
78
  if current_tokens:
79
  aspects.append(aspect_tokenizer.convert_tokens_to_string(current_tokens).strip())
80
 
81
+ return list({tok.replace(" ##", "").strip() for tok in aspects})
 
 
82
 
83
+ # ────────────────────── FastAPI ──────────────────────
84
  app = FastAPI()
85
 
 
86
  @app.post("/analyze", response_model=AnalysisResult)
87
  def analyze_comment(comment: Comment):
88
  text_pl = comment.text
89
  text_en = translate_pl_to_en([text_pl])[0]
90
+ aspects_en = extract_aspects(text_en)
91
 
92
+ results = []
93
+ seen = set()
94
+ for asp in aspects_en:
95
  input_text = f"{text_en} [SEP] {asp}"
96
+ inputs = sentiment_tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
 
 
97
  with torch.no_grad():
98
  logits = sentiment_model(**inputs).logits
99
+ pred = int(torch.argmax(logits, dim=1).cpu())
100
+ sentiment = ["negatywny", "neutralny", "pozytywny", "konfliktowy"][pred]
101
+
102
+ asp_lower = asp.lower()
103
+ asp_pl = aspect_aliases.get(asp_lower, translate_en_to_pl([asp])[0].lower())
104
+
105
+ if asp_pl not in seen:
106
+ seen.add(asp_pl)
107
+ results.append(AspectSentiment(aspect=asp_pl, sentiment=sentiment))
108
+
109
+ return {"results": results}