Spaces:

asmashayea
/

absa-app

Sleeping

App Files Files Community

asmashayea committed on Sep 3, 2025

Commit

6d91ffe

1 Parent(s): 4308dad

Add application file

Browse files

Files changed (5) hide show

app.py +23 -0
araberta_setting/modeling_bilstm_crf.py +45 -0
inference.py +157 -0
requirements.txt +9 -0
seq2seq_inference.py +50 -0

app.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import gradio as gr
+from inference import predict_absa, MODEL_OPTIONS
+def run_absa(review, model_choice):
+    try:
+        return predict_absa(review, model_choice)
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+demo = gr.Interface(
+    fn=run_absa,
+    inputs=[
+        gr.Textbox(label="Arabic Review"),
+        gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Choose Model", value="mT5")
+    ],
+    outputs=gr.Textbox(label="Extracted Aspect-Sentiment-Opinion Triplets"),
+    title="Arabic ABSA (Aspect-Based Sentiment Analysis)",
+    description="Choose a model (Araberta, mT5, GPT) to extract aspects, opinions, and sentiment using LoRA adapters"
+)
+if __name__ == "__main__":
+    demo.launch()

araberta_setting/modeling_bilstm_crf.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import torch
+import torch.nn as nn
+from torchcrf import CRF
+class BERT_BiLSTM_CRF(nn.Module):
+    def __init__(self, base_model, config, dropout_rate=0.2, rnn_dim=256):
+        super().__init__()
+        self.bert = base_model
+        self.label2id = config.label2id  # <-- pulled from config
+        self.id2label = config.id2label
+        self.num_labels = config.num_labels
+        self.bilstm = nn.LSTM(
+            self.bert.config.hidden_size,
+            rnn_dim,
+            num_layers=2,
+            batch_first=True,
+            bidirectional=True,
+            dropout=0.2
+        )
+        self.dropout = nn.Dropout(dropout_rate)
+        self.classifier = nn.Linear(rnn_dim * 2, self.num_labels)
+        self.crf = CRF(self.num_labels, batch_first=True)
+    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
+        outputs = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids
+        )
+        lstm_out, _ = self.bilstm(self.dropout(outputs.last_hidden_state))
+        emissions = self.classifier(lstm_out)
+        mask = attention_mask.bool()
+        if labels is not None:
+            safe_labels = labels.clone()
+            safe_labels[labels == -100] = self.label2id['O']
+            loss = -self.crf(emissions, safe_labels, mask=mask, reduction='mean')
+            return {'loss': loss, 'logits': emissions}
+        else:
+            decoded = self.crf.decode(emissions, mask=mask)
+            max_len = input_ids.shape[1]
+            padded_decoded = [seq + [0] * (max_len - len(seq)) for seq in decoded]
+            logits = torch.tensor(padded_decoded, device=input_ids.device)
+            return {'logits': logits}

inference.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import torch
+import json
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, AutoConfig
+from peft import LoraConfig, get_peft_model, PeftModel
+from araberta_setting.modeling_bilstm_crf import BERT_BiLSTM_CRF
+from seq2seq_inference import infer_t5_prompt
+from huggingface_hub import hf_hub_download
+# Define supported models and their adapter IDs.
+# Each UI label maps to a "base" checkpoint id and an "adapter" repo id;
+# load_model() reads both keys.
+# NOTE(review): predict_absa() only dispatches 'mT5', 'mBART' and 'Araberta';
+# the GPT entries below have no inference path in this file.
+MODEL_OPTIONS = {
+    "Araberta": {
+        # NOTE(review): load_araberta() hard-codes "asmashayea/absa-arabert"
+        # (no trailing "a") and does not read this entry -- confirm the repo id.
+        "base": "asmashayea/absa-araberta",
+        "adapter": "asmashayea/absa-araberta"
+    },
+    "mT5": {
+        "base": "google/mt5-base",
+        # NOTE(review): "mt4" may be a typo for "mt5" -- confirm the repo id.
+        "adapter": "asmashayea/mt4-absa"
+    },
+    "mBART": {
+        "base": "facebook/mbart-large-50-many-to-many-mmt",
+        "adapter": "asmashayea/mbart-absa"
+    },
+    "GPT3.5": {
+        "base": "bigscience/bloom-560m",  # example, not ideal for ABSA
+        "adapter": "asmashayea/gpt-absa"
+    },
+    "GPT4o": {
+        "base": "bigscience/bloom-560m",  # example, not ideal for ABSA
+        "adapter": "asmashayea/gpt-absa"
+    }
+}
+# Loaded (tokenizer, model) pairs keyed by model label, filled on first use.
+cached_models = {}
+def load_araberta():
+    path = "asmashayea/absa-arabert"
+    tokenizer = AutoTokenizer.from_pretrained(path)
+    base_model = AutoModel.from_pretrained(path)
+    lora_config = LoraConfig.from_pretrained(path)
+    lora_model = get_peft_model(base_model, lora_config)
+    local_pt = hf_hub_download(repo_id="asmashayea/absa-arabert", filename="bilstm_crf_head.pt")
+    config = AutoConfig.from_pretrained(path)
+    model = BERT_BiLSTM_CRF(lora_model, config)
+    model.load_state_dict(torch.load(local_pt))
+    model.eval()
+    cached_models["Araberta"] = (tokenizer, model)
+    return tokenizer, model
+def infer_araberta(text):
+    if "Araberta" not in cached_models:
+        tokenizer, model = load_araberta()
+    else:
+        tokenizer, model = cached_models["Araberta"]
+    device = next(model.parameters()).device
+    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
+    input_ids = inputs['input_ids'].to(device)
+    attention_mask = inputs['attention_mask'].to(device)
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        predicted_ids = outputs['logits'][0].cpu().tolist()
+    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
+    predicted_labels = [model.config.id2label.get(p, 'O') for p in predicted_ids]
+    clean_tokens = [t for t in tokens if t not in tokenizer.all_special_tokens]
+    clean_labels = [l for t, l in zip(tokens, predicted_labels) if t not in tokenizer.all_special_tokens]
+    # Horizontal output
+    pairs = [f"{token}: {label}" for token, label in zip(clean_tokens, clean_labels)]
+    horizontal_output = " | ".join(pairs)
+    # Group by aspect span
+    aspects = []
+    current_tokens = []
+    current_sentiment = None
+    for token, label in zip(clean_tokens, clean_labels):
+        if label.startswith("B-"):
+            if current_tokens:
+                aspects.append({
+                    "aspect": " ".join(current_tokens).replace("##", ""),
+                    "sentiment": current_sentiment
+                })
+            current_tokens = [token]
+            current_sentiment = label.split("-")[1]
+        elif label.startswith("I-") and current_sentiment == label.split("-")[1]:
+            current_tokens.append(token)
+        else:
+            if current_tokens:
+                aspects.append({
+                    "aspect": " ".join(current_tokens).replace("##", ""),
+                    "sentiment": current_sentiment
+                })
+                current_tokens = []
+                current_sentiment = None
+    if current_tokens:
+        aspects.append({
+            "aspect": " ".join(current_tokens).replace("##", ""),
+            "sentiment": current_sentiment
+        })
+    return {
+        "token_predictions": horizontal_output,
+        "aspects": aspects
+    }
+def load_model(model_key):
+    if model_key in cached_models:
+        return cached_models[model_key]
+    base_id = MODEL_OPTIONS[model_key]["base"]
+    adapter_id = MODEL_OPTIONS[model_key]["adapter"]
+    tokenizer = AutoTokenizer.from_pretrained(adapter_id)
+    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_id)
+    model = PeftModel.from_pretrained(base_model, adapter_id)
+    model.eval()
+    cached_models[model_key] = (tokenizer, model)
+    return tokenizer, model
+def predict_absa(text, model_choice):
+    if model_choice in ['mT5', 'mBART']:
+        tokenizer, model = load_model(model_choice)
+        decoded = infer_t5_prompt(text, tokenizer, model)
+    elif model_choice == 'Araberta':
+        decoded = infer_araberta(text)
+    # prompt = f"استخرج الجوانب والآراء والمشاعر من النص التالي:\n{text}"
+    # inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+    # with torch.no_grad():
+    #     outputs = model.generate(**inputs, max_new_tokens=128)
+    # decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return decoded

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+transformers
+gradio
+peft
+torchcrf
+torch
+torchvision
+torchaudio
+pytorch-crf
+sentencepiece

seq2seq_inference.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import json
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from peft import PeftModel
+# Instruction text that infer_t5_prompt() prepends to every review. It asks
+# for a JSON list of objects with 'aspect' and 'sentiment' keys.
+SYSTEM_PROMPT = (
+    "You are an advanced AI model specialized in extracting aspects and determining their sentiment polarity from customer reviews.\n\n"
+    "Instructions:\n"
+    "1. Extract only the aspects (nouns) mentioned in the review.\n"
+    "2. Assign a sentiment to each aspect: \"positive\", \"negative\", or \"neutral\".\n"
+    "3. Return aspects in the same language as they appear.\n"
+    "4. An aspect must be a noun that refers to a specific item or service the user described.\n"
+    "5. Ignore adjectives, general ideas, and vague topics.\n"
+    "6. Do NOT translate, explain, or add extra text.\n"
+    "7. The output must be just a valid JSON list with 'aspect' and 'sentiment'. Start with `[` and stop at `]`.\n"
+    "8. Do NOT output the instructions, review, or any text — only one output JSON list.\n"
+    "9. Just one output and one review."
+)
+def infer_t5_prompt(review_text, tokenizer, peft_model):
+    prompt = SYSTEM_PROMPT + f"\n\nReview: {review_text}"
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(peft_model.device)
+    with torch.no_grad():
+        outputs = peft_model.generate(
+            **inputs,
+            max_new_tokens=256,
+            num_beams=4,
+            do_sample=False,
+            temperature=0.0,
+            early_stopping=True,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+    decoded = tokenizer.decode(
+        outputs[0],
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    ).strip()
+    decoded = decoded.replace('<extra_id_0>', '').replace('</s>', '').strip()
+    try:
+        return json.loads(decoded)
+    except json.JSONDecodeError:
+        return decoded