Spaces:

Frorozcol
/

financIA

Build error

App Files Files Community

Frorozcol committed on Jun 1, 2023

Commit

9ee675e

1 Parent(s): 203292f

Load the app

Browse files

Files changed (11) hide show

app.py +10 -0
checkpoints/model.ckpt +3 -0
src/__init__.py +1 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/__pycache__/model.cpython-310.pyc +0 -0
src/__pycache__/predict.cpython-310.pyc +0 -0
src/__pycache__/tokenizer.cpython-310.pyc +0 -0
src/dataset.py +46 -0
src/model.py +83 -0
src/predict.py +27 -0
src/tokenizer.py +14 -0

app.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import streamlit as st
+from src import get_predict
+def main():
+    st.title("Aplicación de Streamlit")
+    texto = st.text_input("Ingresa un texto")
+    if texto:
+        resultado = get_predict(texto)
+        st.write("Resultado:", resultado)
+if __name__ == '__main__':
+    main()

checkpoints/model.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ed370e19a5364d1bfa14f5f67b6b21c34b8181fbf5f4c91258f8b1aeab6ca18
+size 435270317

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .predict import *

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (191 Bytes). View file

src/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (3.61 kB). View file

src/__pycache__/predict.cpython-310.pyc ADDED Viewed

Binary file (1 kB). View file

src/__pycache__/tokenizer.cpython-310.pyc ADDED Viewed

Binary file (643 Bytes). View file

src/dataset.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from torch.utils.data import Dataset, DataLoader
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    get_constant_schedule_with_warmup,
+)
+import torch
+class FinanciaSentimental(Dataset):
+    """This class is used to load the data and tokenize it"""
+    def __init__(self, tokenizer, dataframe, columns, max_len=512):
+        self.tokenizer = tokenizer
+        self.dataframe = dataframe
+        ## Columns to target
+        self._columns = columns
+        self.max_len = max_len
+    @property
+    def columns(self):
+        """Return the columns to target"""
+        return self._columns
+    def __len__(self):
+        """Return the length of the dataset"""
+        return len(self.dataframe)
+    def __getitem__(self, index):
+        """Get the data at the index"""
+        values = self.dataframe.iloc[index]
+        text = values['text']
+        label = values[self._columns].values.astype(np.float32)
+        inputs = self.tokenizer.encode_plus(text, max_length=130, pad_to_max_length=True, padding='max_length', truncation=True, return_tensors='pt')
+        label = torch.tensor(label, dtype=torch.float)
+        input_ids = inputs["input_ids"].squeeze().to(dtype=torch.long)
+        attention_mask = inputs["attention_mask"].squeeze().to(dtype=torch.long)
+        token_type_ids = inputs["token_type_ids"].squeeze().to(dtype=torch.long)
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "labels":label
+        }
+        return inputs_dict

src/model.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+import lightning.pytorch as pl
+from tqdm import tqdm
+from sklearn.metrics import f1_score, accuracy_score
+from torch.nn import BCEWithLogitsLoss
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    get_constant_schedule_with_warmup,
+)
+class FinanciaMultilabel(pl.LightningModule):
+    def __init__(self, model, num_labels):
+        super().__init__()
+        self.model = model
+        self.num_labels = num_labels
+        self.loss = BCEWithLogitsLoss()
+        self.validation_step_outputs = []
+    def forward(self, input_ids, attention_mask, token_type_ids):
+        return self.model(input_ids, attention_mask, token_type_ids).logits
+    def training_step(self, batch, batch_idx):
+        input_ids = batch["input_ids"]
+        attention_mask = batch["attention_mask"]
+        labels = batch["labels"]
+        token_type_ids = batch["token_type_ids"]
+        outputs = self(input_ids, attention_mask, token_type_ids)
+        loss = self.loss(outputs.view(-1,self.num_labels), labels.type_as(outputs).view(-1,self.num_labels))
+        self.log('train_loss', loss)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        input_ids = batch["input_ids"]
+        attention_mask = batch["attention_mask"]
+        labels = batch["labels"]
+        token_type_ids = batch["token_type_ids"]
+        outputs = self(input_ids, attention_mask, token_type_ids)
+        loss = self.loss(outputs.view(-1,self.num_labels), labels.type_as(outputs).view(-1,self.num_labels))
+        pred_labels = torch.sigmoid(outputs)
+        info = {'val_loss': loss, 'pred_labels': pred_labels, 'labels': labels}
+        self.validation_step_outputs.append(info)
+        return
+    def on_validation_epoch_end(self):
+        outputs = self.validation_step_outputs
+        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
+        pred_labels = torch.cat([x['pred_labels'] for x in outputs])
+        labels = torch.cat([x['labels'] for x in outputs])
+        threshold = 0.50
+        pred_bools = pred_labels > threshold
+        true_bools = labels == 1
+        val_f1_accuracy = f1_score(true_bools.cpu(), pred_bools.cpu(), average='micro')*100
+        val_flat_accuracy = accuracy_score(true_bools.cpu(), pred_bools.cpu())*100
+        self.log('val_loss', avg_loss)
+        self.log('val_f1_accuracy', val_f1_accuracy, prog_bar=True)
+        self.log('val_flat_accuracy', val_flat_accuracy, prog_bar=True)
+        self.validation_step_outputs.clear()
+    def configure_optimizers(self):
+        optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5)
+        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2, verbose=True, min_lr=1e-6)
+        return {
+                'optimizer': optimizer,
+                'lr_scheduler': {
+                    'scheduler': scheduler,
+                    'monitor': 'val_loss'
+                }
+            }
+def load_model(checkpoint_path, model, num_labels, device):
+    model_hugginface = AutoModelForSequenceClassification.from_pretrained(model, num_labels=num_labels, ignore_mismatched_sizes=True)
+    model = FinanciaMultilabel.load_from_checkpoint(
+        checkpoint_path,
+        model=model_hugginface,
+        num_labels=num_labels,
+        map_location=device
+    )
+    return model

src/predict.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from pathlib import Path
+import torch
+from .tokenizer import load_tokenizer, preprocessing_text
+from .model import load_model
+# CONFIG
+NUM_VARAIBLES = 3
+NUM_LABELS = 3
+num_labels = NUM_LABELS * NUM_VARAIBLES
+divice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+num_labels = NUM_LABELS * NUM_VARAIBLES
+model_name = "pysentimiento/robertuito-sentiment-analysis"
+checkpoint_path = Path(__file__).parent.parent / "checkpoints" / "model.ckpt"
+tokenizer = load_tokenizer(model_name)
+model = load_model(checkpoint_path, model_name, num_labels, divice)
+def get_predict(text):
+    inputs = preprocessing_text(text, tokenizer)
+    input_ids = inputs["input_ids"].to(divice)
+    attention_mask = inputs["attention_mask"].to(divice)
+    token_type_ids = inputs["token_type_ids"].to(divice)
+    outputs = model(input_ids, attention_mask, token_type_ids)
+    preds = torch.sigmoid(outputs).detach().cpu().numpy()
+    return preds

src/tokenizer.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from transformers import (
+    AutoTokenizer,
+)
+def load_tokenizer(model_tokenizer):
+    """Load the tokenizer"""
+    return AutoTokenizer.from_pretrained(model_tokenizer)
+def preprocessing_text(text, tokenizer):
+    """Tokenize the text"""
+    return tokenizer.encode_plus(text, max_length=130, pad_to_max_length=True, padding='max_length', truncation=True, return_tensors='pt')