asmashayea committed
Commit c7d107e · 1 Parent(s): 4177079
Files changed (2):
  1. app.py +4 -5
  2. inference.py +51 -49
app.py CHANGED
@@ -5,18 +5,17 @@ def run_absa(review, model_choice):
     try:
         return predict_absa(review, model_choice)
     except Exception as e:
-        return {"error": str(e)}  # JSON output for errors too
-
-# app
+        return {"error": str(e)}
+
 demo = gr.Interface(
     fn=run_absa,
     inputs=[
         gr.Textbox(label="Arabic Review"),
         gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose Model", value="Araberta")
     ],
-    outputs=gr.JSON(label="Extracted Aspect-Sentiment-Opinion Triplets"),  # ✅ JSON viewer
+    outputs=gr.JSON(label="Extracted Aspect-Sentiment-Opinion Triplets"),
     title="Arabic ABSA (Aspect-Based Sentiment Analysis)",
-    description="Choose a model (Araberta, mT5, GPT) to extract aspects, opinions, and sentiment using LoRA adapters"
+    description="Choose a model (Araberta, mT5, mBART, GPT3.5, GPT4o) to extract aspects and sentiment"
 )
 
 if __name__ == "__main__":
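Reviewer note: both branches of run_absa now return a plain dict, which the gr.JSON output renders directly. A hypothetical smoke test, not part of the commit (the review text and printed shapes are illustrative):

from app import run_absa

print(run_absa("الطعام لذيذ لكن الخدمة بطيئة", "Araberta"))
# e.g. {"aspects": [{"aspect": "...", "sentiment": "..."}]}
print(run_absa("any text", "NoSuchModel"))
# {"error": "Model NoSuchModel not supported"}, via predict_absa's fallback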
inference.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import torch
 import json
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, AutoConfig
@@ -5,8 +6,15 @@ from peft import LoraConfig, get_peft_model, PeftModel
 from araberta_setting.modeling_bilstm_crf import BERT_BiLSTM_CRF
 from seq2seq_inference import infer_t5_prompt
 from huggingface_hub import hf_hub_download
+from openai import OpenAI
+
+# 🔑 OpenAI client (make sure OPENAI_API_KEY is set in Hugging Face Space secrets)
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Your fine-tuned OpenAI model IDs
+GPT35_FINETUNED = "ft:gpt-3.5-turbo-0125:asma:gpt-3-5-turbo-absa:Bb6gmwkE"
+GPT4O_FINETUNED = "ft:gpt-4o-mini-2024-07-18:asma:gpt4-finetune-absa:BazoEjnp"
 
-# Define supported models and their adapter IDs
 MODEL_OPTIONS = {
     "Araberta": {
         "base": "asmashayea/absa-araberta",
@@ -21,18 +29,20 @@ MODEL_OPTIONS = {
         "adapter": "asmashayea/mbart-absa"
     },
     "GPT3.5": {
-        "base": "bigscience/bloom-560m",  # placeholder
-        "adapter": "asmashayea/gpt-absa"
+        "base": "openai",
+        "adapter": GPT35_FINETUNED
     },
     "GPT4o": {
-        "base": "bigscience/bloom-560m",  # placeholder
-        "adapter": "asmashayea/gpt-absa"
+        "base": "openai",
+        "adapter": GPT4O_FINETUNED
     }
 }
 
 cached_models = {}
 
-
+# ---------------------------
+# Araberta loader
+# ---------------------------
 def load_araberta():
     path = "asmashayea/absa-arabert"
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -40,17 +50,14 @@ def load_araberta():
     tokenizer = AutoTokenizer.from_pretrained(path)
     base_model = AutoModel.from_pretrained(path)
 
-    # Load LoRA adapter
     lora_config = LoraConfig.from_pretrained(path)
     lora_model = get_peft_model(base_model, lora_config)
 
-    # Download CRF head from Hub
     local_pt = hf_hub_download(repo_id=path, filename="bilstm_crf_head.pt")
 
     config = AutoConfig.from_pretrained(path)
     model = BERT_BiLSTM_CRF(lora_model, config)
 
-    # Always map to current device
     state_dict = torch.load(local_pt, map_location=torch.device(device))
     model.load_state_dict(state_dict)
     model.to(device).eval()
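A hedged note on this hunk: get_peft_model() attaches a freshly initialized adapter, so the trained LoRA weights must be restored by the load_state_dict call below it. If bilstm_crf_head.pt turned out to hold only the CRF head, the trained adapter would instead be loaded along these lines (a sketch under that assumption, not a confirmed bug):

from transformers import AutoModel
from peft import PeftModel

path = "asmashayea/absa-arabert"
base_model = AutoModel.from_pretrained(path)
# load the trained adapter weights published in the repo, not a fresh random-init adapter
lora_model = PeftModel.from_pretrained(base_model, path)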
@@ -66,14 +73,7 @@ def infer_araberta(text):
     tokenizer, model = cached_models["Araberta"]
 
     device = next(model.parameters()).device
-
-    inputs = tokenizer(
-        text,
-        return_tensors='pt',
-        truncation=True,
-        padding='max_length',
-        max_length=128
-    )
+    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
     input_ids = inputs['input_ids'].to(device)
     attention_mask = inputs['attention_mask'].to(device)
 
@@ -82,55 +82,38 @@ def infer_araberta(text):
     predicted_ids = outputs['logits'][0].cpu().tolist()
 
     tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
-    predicted_labels = [model.config.id2label.get(p, 'O') for p in predicted_ids]
+    predicted_labels = [model.id2label.get(p, 'O') for p in predicted_ids]
 
     clean_tokens = [t for t in tokens if t not in tokenizer.all_special_tokens]
     clean_labels = [l for t, l in zip(tokens, predicted_labels) if t not in tokenizer.all_special_tokens]
 
-    # Horizontal token:label pairs
-    pairs = [f"{token}: {label}" for token, label in zip(clean_tokens, clean_labels)]
-    horizontal_output = " | ".join(pairs)
-
-    # Group into aspect spans
-    aspects = []
-    current_tokens, current_sentiment = [], None
+    aspects, current_tokens, current_sentiment = [], [], None
     for token, label in zip(clean_tokens, clean_labels):
         if label.startswith("B-"):
             if current_tokens:
-                aspects.append({
-                    "aspect": " ".join(current_tokens).replace("##", ""),
-                    "sentiment": current_sentiment
-                })
+                aspects.append({"aspect": " ".join(current_tokens).replace("##", ""), "sentiment": current_sentiment})
             current_tokens = [token]
             current_sentiment = label.split("-")[1]
         elif label.startswith("I-") and current_sentiment == label.split("-")[1]:
             current_tokens.append(token)
         else:
             if current_tokens:
-                aspects.append({
-                    "aspect": " ".join(current_tokens).replace("##", ""),
-                    "sentiment": current_sentiment
-                })
+                aspects.append({"aspect": " ".join(current_tokens).replace("##", ""), "sentiment": current_sentiment})
             current_tokens, current_sentiment = [], None
     if current_tokens:
-        aspects.append({
-            "aspect": " ".join(current_tokens).replace("##", ""),
-            "sentiment": current_sentiment
-        })
-
-    return {
-        "token_predictions": horizontal_output,
-        "aspects": aspects
-    }
+        aspects.append({"aspect": " ".join(current_tokens).replace("##", ""), "sentiment": current_sentiment})
 
+    return {"aspects": aspects}
 
+# ---------------------------
+# Hugging Face seq2seq loaders
+# ---------------------------
 def load_model(model_key):
     if model_key in cached_models:
         return cached_models[model_key]
 
     base_id = MODEL_OPTIONS[model_key]["base"]
     adapter_id = MODEL_OPTIONS[model_key]["adapter"]
-
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
     tokenizer = AutoTokenizer.from_pretrained(adapter_id)
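The BIO grouping loop in this hunk is behaviorally unchanged; the commit only drops the token_predictions debug string from the return value and reads id2label from the model rather than model.config. A minimal rerun of the grouping on toy data (tokens and tags are illustrative, not model output):

tokens = ["الطعام", "لذيذ", "الخدمة", "بطيئة"]
labels = ["B-POS", "O", "B-NEG", "O"]

aspects, current_tokens, current_sentiment = [], [], None
for token, label in zip(tokens, labels):
    if label.startswith("B-"):
        if current_tokens:
            aspects.append({"aspect": " ".join(current_tokens).replace("##", ""), "sentiment": current_sentiment})
        current_tokens = [token]
        current_sentiment = label.split("-")[1]
    elif label.startswith("I-") and current_sentiment == label.split("-")[1]:
        current_tokens.append(token)
    else:
        if current_tokens:
            aspects.append({"aspect": " ".join(current_tokens).replace("##", ""), "sentiment": current_sentiment})
        current_tokens, current_sentiment = [], None
if current_tokens:
    aspects.append({"aspect": " ".join(current_tokens).replace("##", ""), "sentiment": current_sentiment})

print(aspects)
# [{'aspect': 'الطعام', 'sentiment': 'POS'}, {'aspect': 'الخدمة', 'sentiment': 'NEG'}]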
@@ -141,14 +124,33 @@ def load_model(model_key):
     cached_models[model_key] = (tokenizer, model)
     return tokenizer, model
 
-
+# ---------------------------
+# OpenAI inference
+# ---------------------------
+def infer_openai(text, model_name):
+    prompt = f"Extract aspects and their sentiment from this review:\n\n{text}\n\nReturn JSON with 'aspect' and 'sentiment'."
+    response = client.chat.completions.create(
+        model=model_name,
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=512,
+        temperature=0
+    )
+    output = response.choices[0].message.content.strip()
+    try:
+        return json.loads(output)
+    except json.JSONDecodeError:
+        return {"raw_output": output}
+
+# ---------------------------
+# Unified predictor
+# ---------------------------
 def predict_absa(text, model_choice):
     if model_choice in ['mT5', 'mBART']:
         tokenizer, model = load_model(model_choice)
-        decoded = infer_t5_prompt(text, tokenizer, model)
+        return infer_t5_prompt(text, tokenizer, model)
     elif model_choice == 'Araberta':
-        decoded = infer_araberta(text)
+        return infer_araberta(text)
+    elif model_choice in ['GPT3.5', 'GPT4o']:
+        return infer_openai(text, MODEL_OPTIONS[model_choice]["adapter"])
     else:
-        decoded = {"error": f"Model {model_choice} not supported"}
-
-    return decoded
+        return {"error": f"Model {model_choice} not supported"}
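A hypothetical end-to-end check of the new dispatch, not part of the commit (requires OPENAI_API_KEY; the review text is illustrative):

from inference import predict_absa

print(predict_absa("الموقع ممتاز لكن الغرفة صغيرة", "GPT4o"))
# parsed JSON on success, otherwise {"raw_output": "..."} from infer_openai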
 