Spaces:

ngocminhta
/

falcon-api

Running

App Files Community

ngocminhta committed on May 8, 2025

Commit

5b2f797

1 Parent(s): 6b4e43b

update batch process

Browse files

Files changed (6) hide show

app.py +7 -16
infer.py +10 -28
unsup-simcse-xlm-roberta-base/.DS_Store +0 -0
unsup-simcse-xlm-roberta-base/config.json +0 -27
unsup-simcse-xlm-roberta-base/tokenizer.json +0 -0
unsup-simcse-xlm-roberta-base/trainer_state.json +0 -40

app.py CHANGED Viewed

@@ -43,12 +43,6 @@ def load_model_resources():
     global model, tokenizer, index, label_dict, is_mixed_dict
     model = TextEmbeddingModel(opt.model_name)
-    # state_dict = torch.load(opt.model_path, map_location=model.model.device)
-    # new_state_dict={}
-    # for key in state_dict.keys():
-    #     if key.startswith('model.'):
-    #         new_state_dict[key[6:]]=state_dict[key]
-    # model.load_state_dict(state_dict)
     tokenizer=model.tokenizer
     index = Indexer(opt.embedding_dim)
@@ -64,16 +58,13 @@ async def predict(request: Request):
     text_list = data.get("text", [])
     if mode == "normal":
-        results = []
-        for text in text_list:
-            result = infer_3_class(model=model,
-                tokenizer=tokenizer,
-                index=index,
-                label_dict=label_dict,
-                is_mixed_dict=is_mixed_dict,
-                text=text,
-                K=20)
-            results.append(result)
         return JSONResponse(content={"results": results})
     elif mode == "advanced":
         return 0

     global model, tokenizer, index, label_dict, is_mixed_dict
     model = TextEmbeddingModel(opt.model_name)
     tokenizer=model.tokenizer
     index = Indexer(opt.embedding_dim)
     text_list = data.get("text", [])
     if mode == "normal":
+        results = infer_3_class(model=model,
+            tokenizer=tokenizer,
+            index=index,
+            label_dict=label_dict,
+            is_mixed_dict=is_mixed_dict,
+            text_list=text_list,
+            K=20)
         return JSONResponse(content={"results": results})
     elif mode == "advanced":
         return 0

infer.py CHANGED Viewed

@@ -44,7 +44,7 @@ def load_pkl(path):
     with open(path, 'rb') as f:
         return pickle.load(f)
-def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text, K):
     # model = TextEmbeddingModel(opt.model_name).cuda()
     # state_dict = torch.load(opt.model_path, map_location=model.model.device)
     # new_state_dict={}
@@ -61,7 +61,7 @@ def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text, K):
     # text = opt.text
     encoded_text = tokenizer.batch_encode_plus(
-                        [text],
                         return_tensors="pt",
                         max_length=512,
                         padding="max_length",
@@ -89,32 +89,14 @@ def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text, K):
             boost = class_type_boost(is_mixed_dict[int(id)],initial_pred)
             fuzzy_cnt[label] += weight * boost
-        # final = max(fuzzy_cnt, key=fuzzy_cnt.get)
-        # print(f"Top {opt.K} results for text:")
-        # cnt = {(1,0):0,(0,10^3):0,(1,1):0}
-        # for j, (id, score) in enumerate(zip(ids, scores)):
-        #     print(f"{j+1}. ID {id} Label {label_dict[int(id)]} Is_mixed {is_mixed_dict[int(id)]} Score {score}")
-        #     cnt[(label_dict[int(id)], is_mixed_dict[int(id)])]+=1
-        # final = max(cnt, key=cnt.get)
-        # pred.append(final)
-        # if final==(1,0):
-        #     print("Human")
-        #     return 0
-        # elif final==(0,10^3):
-        #     print("AI")
-        #     return 1
-        # else:
-        #     print("Mixed")
-        #     return 2
-        # pred.append(final)
-        total_score = sum(fuzzy_cnt.values())
-        final = dict()
-        final[0] = round(fuzzy_cnt[(1,0)] / total_score*100,2)
-        final[1] = round(fuzzy_cnt[(0,10^3)] / total_score*100,2)
-        final[2] = round(fuzzy_cnt[(1,1)] / total_score*100,2)
-        print(f"Final prediction: {final}")
-    return final
 if __name__ == "__main__":

     with open(path, 'rb') as f:
         return pickle.load(f)
+def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
     # model = TextEmbeddingModel(opt.model_name).cuda()
     # state_dict = torch.load(opt.model_path, map_location=model.model.device)
     # new_state_dict={}
     # text = opt.text
     encoded_text = tokenizer.batch_encode_plus(
+                        text_list,
                         return_tensors="pt",
                         max_length=512,
                         padding="max_length",
             boost = class_type_boost(is_mixed_dict[int(id)],initial_pred)
             fuzzy_cnt[label] += weight * boost
+            total_score = sum(fuzzy_cnt.values())
+            final = dict()
+            final[0] = round(fuzzy_cnt[(1,0)] / total_score*100,2)
+            final[1] = round(fuzzy_cnt[(0,10^3)] / total_score*100,2)
+            final[2] = round(fuzzy_cnt[(1,1)] / total_score*100,2)
+            print(f"Final prediction: {final}")
+            pred.append(final)
+    return pred
 if __name__ == "__main__":

unsup-simcse-xlm-roberta-base/.DS_Store DELETED Viewed

Binary file (6.15 kB)

unsup-simcse-xlm-roberta-base/config.json DELETED Viewed

@@ -1,27 +0,0 @@
-{
-  "_name_or_path": "xlm-roberta-base",
-  "architectures": [
-    "RobertaForCL"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 514,
-  "model_type": "xlm-roberta",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "output_past": true,
-  "pad_token_id": 1,
-  "position_embedding_type": "absolute",
-  "transformers_version": "4.2.1",
-  "type_vocab_size": 1,
-  "use_cache": true,
-  "vocab_size": 250002
-}

unsup-simcse-xlm-roberta-base/tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

unsup-simcse-xlm-roberta-base/trainer_state.json DELETED Viewed

@@ -1,40 +0,0 @@
-{
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.9999040030719017,
-  "global_step": 1953,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.26,
-      "learning_rate": 7.43983614951357e-06,
-      "loss": 0.1187,
-      "step": 500
-    },
-    {
-      "epoch": 0.51,
-      "learning_rate": 4.8796722990271386e-06,
-      "loss": 0.0011,
-      "step": 1000
-    },
-    {
-      "epoch": 0.77,
-      "learning_rate": 2.319508448540707e-06,
-      "loss": 0.0006,
-      "step": 1500
-    },
-    {
-      "epoch": 1.0,
-      "step": 1953,
-      "train_runtime": 2937.1499,
-      "train_samples_per_second": 0.665
-    }
-  ],
-  "max_steps": 1953,
-  "num_train_epochs": 1,
-  "total_flos": 262568325479694336,
-  "trial_name": null,
-  "trial_params": null
-}