Spaces:

SFM2001
/

SimpleAES

Paused

App Files Community

SFM2001 committed on Jun 17, 2025

Commit

48cf773

1 Parent(s): 75fb515

accelerate

Browse files

Files changed (1) hide show

inference/infer_single.py +18 -10

inference/infer_single.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 from torch.cuda.amp import autocast
 from create_app import *
 from transformers import GenerationConfig
 def replace_single_newlines(text):
     """Rewrite every isolated newline in ``text`` as an escaped blank line.

     A newline counts as isolated when it is neither preceded nor followed
     by another newline. Each one is replaced by four literal characters:
     a backslash, ``n``, a backslash, ``n``. Newlines that already sit next
     to another newline are left untouched.
     """
     # Lookbehind/lookahead keep runs of two or more newlines out of the match.
     lone_newline = re.compile(r'(?<!\n)\n(?!\n)')
     # In a replacement template a doubled backslash yields one literal
     # backslash, so the output contains backslash-n text, not real newlines.
     return lone_newline.sub('\\\\n\\\\n', text)
@@ -20,19 +21,12 @@ def generate_full_prompt(topic, essay, cefr_stat):
 def generate_and_score_essay(topic, essay):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     global MODELS_LOADED, LONGFORMER_TOKENIZER, LONGFORMER_MODEL, QWEN_TOKENIZER, QWEN_MODEL
     cefr_results = get_cefr_stats(essay)
     full_prompt = generate_full_prompt(topic=topic, essay=essay, cefr_stat=cefr_results)
     essay = replace_single_newlines(essay)
     paragraph_cnt = len(essay.replace('\\n\\n', '\\n').split('\\n'))
-    gen_config = GenerationConfig(
-        max_new_tokens=850,     # cut way down from 1500
-        do_sample=True,
-        top_k=50,
-        top_p=0.9,
-        temperature=0.7,
-        eos_token_id=QWEN_TOKENIZER.eos_token_id,
-        pad_token_id=QWEN_TOKENIZER.eos_token_id,
-        )
     text = QWEN_TOKENIZER.apply_chat_template(
                 [{"role": "user", "content": full_prompt}],
                 tokenize=False,
@@ -46,13 +40,25 @@ def generate_and_score_essay(topic, essay):
         truncation=True,
         padding_side='left'
     ).to(device)
-    with torch.no_grad():
         outputs = QWEN_MODEL.generate(
             **inputs,
             generation_config=gen_config,
             use_cache=True,
             return_dict_in_generate=False,
         )
     generated_ids = outputs[0][inputs.input_ids.shape[1]:]
     full_feedback = QWEN_TOKENIZER.decode(
         generated_ids,
@@ -78,6 +84,7 @@ def generate_and_score_essay(topic, essay):
     'paragraph_count': feedback_components.get('paragraph_count', ''),
     'cefr_stat': feedback_components.get('cefr_stat', '')
     })
     score_inputs = LONGFORMER_TOKENIZER(
         score_input,
         return_tensors="pt",
@@ -90,4 +97,5 @@ def generate_and_score_essay(topic, essay):
         outputs = LONGFORMER_MODEL(**score_inputs)  # Get full outputs dictionary
         scores = outputs['logits'].cpu().numpy()
     scores = [round(x) for x in scores[0]]
     return scores, feedback_components

 from torch.cuda.amp import autocast
 from create_app import *
 from transformers import GenerationConfig
+import time
 def replace_single_newlines(text):
     """Rewrite every isolated newline in ``text`` as an escaped blank line.

     A newline counts as isolated when it is neither preceded nor followed
     by another newline. Each one is replaced by four literal characters:
     a backslash, ``n``, a backslash, ``n``. Newlines that already sit next
     to another newline are left untouched.
     """
     # Lookbehind/lookahead keep runs of two or more newlines out of the match.
     lone_newline = re.compile(r'(?<!\n)\n(?!\n)')
     # In a replacement template a doubled backslash yields one literal
     # backslash, so the output contains backslash-n text, not real newlines.
     return lone_newline.sub('\\\\n\\\\n', text)
 def generate_and_score_essay(topic, essay):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     global MODELS_LOADED, LONGFORMER_TOKENIZER, LONGFORMER_MODEL, QWEN_TOKENIZER, QWEN_MODEL
+    print("Analysing CEFR")
     cefr_results = get_cefr_stats(essay)
     full_prompt = generate_full_prompt(topic=topic, essay=essay, cefr_stat=cefr_results)
+    print("Generating prompt")
     essay = replace_single_newlines(essay)
     paragraph_cnt = len(essay.replace('\\n\\n', '\\n').split('\\n'))
     text = QWEN_TOKENIZER.apply_chat_template(
                 [{"role": "user", "content": full_prompt}],
                 tokenize=False,
         truncation=True,
         padding_side='left'
     ).to(device)
+    print("Tokenized")
+    start = time.time()
+    gen_config = GenerationConfig(
+        max_new_tokens=850,     # cut way down from 1500
+        do_sample=True,
+        top_k=20,
+        top_p=0.9,
+        temperature=0.7,
+        eos_token_id=QWEN_TOKENIZER.eos_token_id,
+        pad_token_id=QWEN_TOKENIZER.eos_token_id,
+        )
+    with torch.inference_mode():
         outputs = QWEN_MODEL.generate(
             **inputs,
             generation_config=gen_config,
             use_cache=True,
             return_dict_in_generate=False,
         )
+    print("Generated", time.time() - start)
     generated_ids = outputs[0][inputs.input_ids.shape[1]:]
     full_feedback = QWEN_TOKENIZER.decode(
         generated_ids,
     'paragraph_count': feedback_components.get('paragraph_count', ''),
     'cefr_stat': feedback_components.get('cefr_stat', '')
     })
+    print("input got")
     score_inputs = LONGFORMER_TOKENIZER(
         score_input,
         return_tensors="pt",
         outputs = LONGFORMER_MODEL(**score_inputs)  # Get full outputs dictionary
         scores = outputs['logits'].cpu().numpy()
     scores = [round(x) for x in scores[0]]
+    print("Score got")
     return scores, feedback_components