Spaces:

ElPremOoO
/

CodeMateReadability

Sleeping

App Files Community

test locally

by ElPremOoO - opened May 1, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+48

-300530

This PR is in draft mode

Files changed (7) hide show

codebert_readability_scorer.pth +0 -3
main.py +48 -45
tokenizer_readability/merges.txt +0 -0
tokenizer_readability/special_tokens_map.json +0 -51
tokenizer_readability/tokenizer.json +0 -0
tokenizer_readability/tokenizer_config.json +0 -58
tokenizer_readability/vocab.json +0 -0

codebert_readability_scorer.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e0e0b83b0dc00e03dfc65c24acaf4b242bf97315f56247c4f6e6bc5ec9f0a50e
-size 498672601

main.py CHANGED Viewed

@@ -1,66 +1,69 @@
 from flask import Flask, request, jsonify
 import torch
-from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
 import os
 app = Flask(__name__)
-# Load model and tokenizer
-def load_model():
-    # Load saved config and weights
-    checkpoint = torch.load("codebert_readability_scorer.pth", map_location=torch.device('cpu'))
-    config = RobertaConfig.from_dict(checkpoint['config'])
-    # Initialize model with loaded config
-    model = RobertaForSequenceClassification(config)
-    model.load_state_dict(checkpoint['model_state_dict'])
-    model.eval()
-    return model
-# Load components
-try:
-    tokenizer = RobertaTokenizer.from_pretrained("./tokenizer_readability")
-    model = load_model()
-    print("Model and tokenizer loaded successfully!")
-except Exception as e:
-    print(f"Error loading model: {str(e)}")
 @app.route("/")
 def home():
     return request.url
-@app.route("/predict", methods=["POST"])
 def predict():
-    try:
-        # Get code from request body (JSON)
-        data = request.get_json()
-        if not data or "code" not in data:
-            return jsonify({"error": "Missing 'code' in request body"}), 400
-        code = data["code"]
-        # Tokenize input
-        inputs = tokenizer(
-            code,
-            truncation=True,
-            padding='max_length',
-            max_length=512,
-            return_tensors='pt'
-        )
-        # Make prediction
-        with torch.no_grad():
-            outputs = model(**inputs)
-        # Apply sigmoid and format score
-        score = torch.sigmoid(outputs.logits).item()
-        return jsonify({
-            "score": round(score, 4)
-        })
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)

 from flask import Flask, request, jsonify
 import torch
+from transformers import RobertaTokenizer
 import os
+from transformers import RobertaForSequenceClassification
+import torch.serialization
+import torch
+from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
+from torch.utils.data import Dataset
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import numpy as np
+# Initialize Flask app
 app = Flask(__name__)
+# Load the trained model and tokenizer
+tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
+torch.serialization.add_safe_globals([RobertaForSequenceClassification])
+model = torch.load("model.pth", map_location=torch.device('cpu'), weights_only=False)  # Load the trained model
+# Ensure the model is in evaluation mode
+model.eval()
 @app.route("/")
 def home():
     return request.url
+# @app.route("/predict", methods=["POST"])
+@app.route("/predict")
 def predict():
+    print("Received code:", request.get_json()["code"])
+    code = request.get_json()["code"]
+    # Load saved weights and config
+    checkpoint = torch.load("codebert_vulnerability_scorer.pth")
+    config = RobertaConfig.from_dict(checkpoint['config'])
+    # Rebuild the model with correct architecture
+    model = RobertaForSequenceClassification(config)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    model.eval()
+    # Load tokenizer
+    tokenizer = RobertaTokenizer.from_pretrained('./tokenizer')
+    # Prepare input
+    inputs = tokenizer(
+        code,
+        truncation=True,
+        padding='max_length',
+        max_length=512,
+        return_tensors='pt'
+    )
+    # Make prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+    score = torch.sigmoid(outputs.logits).item()
+    return score
+# Run the Flask app
 if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860)

tokenizer_readability/merges.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_readability/special_tokens_map.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "cls_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "mask_token": {
-    "content": "<mask>",
-    "lstrip": true,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<pad>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "sep_token": {
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<unk>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}

tokenizer_readability/tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_readability/tokenizer_config.json DELETED Viewed

@@ -1,58 +0,0 @@
-{
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": true,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "50264": {
-      "content": "<mask>",
-      "lstrip": true,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<s>",
-  "clean_up_tokenization_spaces": false,
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "errors": "replace",
-  "extra_special_tokens": {},
-  "mask_token": "<mask>",
-  "model_max_length": 512,
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "tokenizer_class": "RobertaTokenizer",
-  "trim_offsets": true,
-  "unk_token": "<unk>"
-}

tokenizer_readability/vocab.json DELETED Viewed

The diff for this file is too large to render. See raw diff