isamdiablo committed · Commit 11dfd26 · verified · 1 Parent(s): 52b37d7

Update README.md

Files changed (1): README.md (+116, -95)
README.md CHANGED

identifier: https://huggingface.co/oeg/RoBERTaSense-FACIL
name: RoBERTaSense-FACIL
version: 0.1.0
keywords:
  - easy-to-read
  - meaning preservation
  - accessibility
  - spanish
  - text pair classification
headline: >-
  Spanish RoBERTa fine-tuned to assess meaning preservation in Easy-to-Read
  (E2R) adaptations.
description: >
  RoBERTaSense-FACIL is a Spanish RoBERTa model fine-tuned to assess meaning
  preservation in Easy-to-Read (E2R) adaptations. Given a pair {original,
  adapted}, it predicts whether the adaptation preserves the meaning of the
  original. ⚠️ Deprecation notice (base model): fine-tuned from
  PlanTL-GOB-ES/roberta-base-bne, which is deprecated as of 2025. For actively
  maintained Spanish RoBERTa models, see BSC-LT.
task:
  - Text classification
  - Pairwise classification
modelCategory:
  - Supervised classification
language:
  - es
license: apache-2.0
parameterSize: 125M
developmentStatus: Active
dateCreated: 25-09-2025
dateModified: 06-10-2025
citation: >
  Diab Lozano, I., & Suárez-Figueroa, M. C. (2025). RoBERTaSense-FACIL: Meaning
  Preservation for Easy-to-Read in Spanish. Retrieved from
  https://huggingface.co/oeg/RoBERTaSense-FACIL
codeRepository: ''
referencePublication: ''
developmentLibrary: PyTorch + Transformers
usageInstructions: |
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import torch

  repo = "oeg/RoBERTaSense-FACIL"
  model = AutoModelForSequenceClassification.from_pretrained(repo)
  tokenizer = AutoTokenizer.from_pretrained(repo)

  original = "El lobo, que parecía amable, engañó a Caperucita."
  adapted = "El lobo parecía amable. El lobo engañó a Caperucita."

  inputs = tokenizer(original, adapted, return_tensors="pt", truncation=True, max_length=512)
  with torch.no_grad():
      logits = model(**inputs).logits
  probs = logits.softmax(-1).squeeze().tolist()
  print({model.config.id2label[i]: probs[i] for i in range(len(probs))})
modelRisks:
  - Trained for Spanish E2R; out-of-domain performance may degrade.
  - >-
    Binary labels compress nuanced cases; borderline adaptations may require
    human review.
  - Synthetic negatives do not cover all real-world human errors.
  - Base model is deprecated; security/robustness updates will not be inherited.
evaluationMetrics:
  - Accuracy
  - F1
  - ROC-AUC
evaluationResults: |
  80/20 stratified split (seed=42). Example results:
  - Accuracy: 0.81
  - F1: 0.84
  - ROC-AUC: 0.83
softwareRequirements:
  - python>=3.9
  - torch>=2.0
  - transformers>=4.40
  - datasets>=2.18
storageRequirements:
  - ~500 MB
memoryRequirements:
  - >-
    >= 8 GB RAM (CPU inference); >= 12 GB VRAM recommended for large-batch
    inference
operatingSystem:
  - Linux
  - macOS
  - Windows
processorRequirements:
  - x86_64 CPU (AVX recommended)
GPURequirements:
  - >-
    Not required for single-pair inference; CUDA GPU recommended for batch
    processing
distribution:
  - encodingFormat: ''
    contentUrl: ''
    contentSize: ''
    quantizationBits: ''
    quantizationMethod: ''
trainedOn:
  - identifier: internal:e2r-positives
    name: Expert-validated E2R pairs (Spanish)
    description: >
      Positive pairs (original↔adapted) from an existing corpus validated by
      experts; used as the positive class.
    url: ''
  - identifier: internal:synthetic-negatives
    name: Synthetic hard negatives (Spanish)
    description: >
      Negatives generated via sentence shuffle, dropout, mismatch
      (derangement), paraphrase-with-distortion, and zero-shot NLI
      contradictions; trivial pairs filtered by BLEU/ROUGE-L thresholds.
    url: ''
testedOn:
  - identifier: internal:heldout-20
    name: Held-out 20% stratified split
    description: >
      Stratified 80/20 split by label (seed=42); pairwise tokenization up to
      512 tokens.
evaluatedOn:
  - identifier: internal:heldout-20
    name: Held-out 20% stratified split
    description: >
      Metrics: Accuracy, F1, ROC-AUC; operating threshold tuned via Youden's J
      (ROC).
validatedOn: ''
author:
  - name: Isam Diab Lozano
    identifier: https://orcid.org/0000-0002-3967-0672
  - name: Mari Carmen Suárez-Figueroa
    identifier: https://orcid.org/0000-0003-3807-5019
successorOf: ''
funder:
  - name: Comunidad de Madrid — PIPF-2022/COM-25762
    identifier: ''
sharedBy:
  - name: Ontology Engineering Group (UPM)
    identifier: https://oeg.fi.upm.es/index.php/en/index.html
wasGeneratedBy:
  - trainingRegion:
      - name: Europe (West)
    cloudProvider:
      - name: ''
        url: ''
    duration: ''
    hardwareType: ''
    fineTunedFromModel: https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne
sdPublisher:
  - name: Ontology Engineering Group
    url: https://oeg.fi.upm.es/index.php/en/index.html
sdLicense: apache-2.0
metrics:
  - accuracy
  - f1
  - roc_auc
base_model:
  - PlanTL-GOB-ES/roberta-base-bne
pipeline_tag: text-classification
---

## Model Card for RoBERTaSense-FACIL
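The frontmatter notes that the operating threshold was tuned via Youden's J on the ROC curve. As a minimal illustrative sketch (not the authors' actual code), the selection amounts to scanning candidate thresholds over the model's positive-class probabilities and keeping the one that maximizes J = TPR - FPR. The `y_true` and `y_score` values below are made-up toy data, not model outputs:

```python
def youden_threshold(y_true, y_score):
    """Return (threshold, J) maximizing Youden's J = TPR - FPR.

    y_true: 0/1 gold labels; y_score: positive-class probabilities.
    """
    pos = sum(y_true)
    neg = len(y_true) - pos
    best_t, best_j = None, -1.0
    # Each distinct score is a candidate operating threshold.
    for t in sorted(set(y_score), reverse=True):
        tp = sum(1 for y, s in zip(y_true, y_score) if y == 1 and s >= t)
        fp = sum(1 for y, s in zip(y_true, y_score) if y == 0 and s >= t)
        j = tp / pos - fp / neg
        if j > best_j:
            best_t, best_j = t, j
    return best_t, best_j

# Toy example (synthetic labels and scores):
t, j = youden_threshold([0, 0, 1, 1, 1], [0.2, 0.6, 0.4, 0.7, 0.9])
# t == 0.7, j == 2/3
```

In practice the same selection is usually done with `sklearn.metrics.roc_curve`, taking the threshold at the index where `tpr - fpr` peaks.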
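The `trainedOn` entry says trivial synthetic negatives were filtered by BLEU/ROUGE-L thresholds. A minimal sketch of the ROUGE-L side, assuming whitespace tokenization and a hypothetical cutoff (the card does not state the actual thresholds or tokenizer used):

```python
def lcs_len(a, b):
    """Length of the longest common subsequence of two token lists."""
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if x == y else max(dp[i - 1][j], dp[i][j - 1])
    return dp[len(a)][len(b)]

def rouge_l_f(reference, candidate):
    """ROUGE-L F1 over whitespace tokens."""
    ref, cand = reference.split(), candidate.split()
    lcs = lcs_len(ref, cand)
    if lcs == 0:
        return 0.0
    p, r = lcs / len(cand), lcs / len(ref)
    return 2 * p * r / (p + r)

# Hypothetical filter: a negative that barely differs from the original
# is too easy (or mislabeled) and gets dropped. The 0.95 cutoff is made up.
def is_trivial_negative(original, negative, cutoff=0.95):
    return rouge_l_f(original, negative) >= cutoff
```

The BLEU side of the filter would follow the same pattern with an n-gram-precision score (e.g. via `sacrebleu`) in place of `rouge_l_f`.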