SFM2001 committed on
Commit
48cf773
·
1 Parent(s): 75fb515

accelerate

Browse files
Files changed (1) hide show
  1. inference/infer_single.py +18 -10
inference/infer_single.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
  from torch.cuda.amp import autocast
5
  from create_app import *
6
  from transformers import GenerationConfig
 
7
 
8
  def replace_single_newlines(text):
9
  return re.sub(r'(?<!\n)\n(?!\n)', '\\\\n\\\\n', text)
@@ -20,19 +21,12 @@ def generate_full_prompt(topic, essay, cefr_stat):
20
  def generate_and_score_essay(topic, essay):
21
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
  global MODELS_LOADED, LONGFORMER_TOKENIZER, LONGFORMER_MODEL, QWEN_TOKENIZER, QWEN_MODEL
 
23
  cefr_results = get_cefr_stats(essay)
24
  full_prompt = generate_full_prompt(topic=topic, essay=essay, cefr_stat=cefr_results)
 
25
  essay = replace_single_newlines(essay)
26
  paragraph_cnt = len(essay.replace('\\n\\n', '\\n').split('\\n'))
27
- gen_config = GenerationConfig(
28
- max_new_tokens=850, # cut way down from 1500
29
- do_sample=True,
30
- top_k=50,
31
- top_p=0.9,
32
- temperature=0.7,
33
- eos_token_id=QWEN_TOKENIZER.eos_token_id,
34
- pad_token_id=QWEN_TOKENIZER.eos_token_id,
35
- )
36
  text = QWEN_TOKENIZER.apply_chat_template(
37
  [{"role": "user", "content": full_prompt}],
38
  tokenize=False,
@@ -46,13 +40,25 @@ def generate_and_score_essay(topic, essay):
46
  truncation=True,
47
  padding_side='left'
48
  ).to(device)
49
- with torch.no_grad():
 
 
 
 
 
 
 
 
 
 
 
50
  outputs = QWEN_MODEL.generate(
51
  **inputs,
52
  generation_config=gen_config,
53
  use_cache=True,
54
  return_dict_in_generate=False,
55
  )
 
56
  generated_ids = outputs[0][inputs.input_ids.shape[1]:]
57
  full_feedback = QWEN_TOKENIZER.decode(
58
  generated_ids,
@@ -78,6 +84,7 @@ def generate_and_score_essay(topic, essay):
78
  'paragraph_count': feedback_components.get('paragraph_count', ''),
79
  'cefr_stat': feedback_components.get('cefr_stat', '')
80
  })
 
81
  score_inputs = LONGFORMER_TOKENIZER(
82
  score_input,
83
  return_tensors="pt",
@@ -90,4 +97,5 @@ def generate_and_score_essay(topic, essay):
90
  outputs = LONGFORMER_MODEL(**score_inputs) # Get full outputs dictionary
91
  scores = outputs['logits'].cpu().numpy()
92
  scores = [round(x) for x in scores[0]]
 
93
  return scores, feedback_components
 
4
  from torch.cuda.amp import autocast
5
  from create_app import *
6
  from transformers import GenerationConfig
7
+ import time
8
 
9
  def replace_single_newlines(text):
10
  return re.sub(r'(?<!\n)\n(?!\n)', '\\\\n\\\\n', text)
 
21
  def generate_and_score_essay(topic, essay):
22
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
  global MODELS_LOADED, LONGFORMER_TOKENIZER, LONGFORMER_MODEL, QWEN_TOKENIZER, QWEN_MODEL
24
+ print("Analysing CEFR")
25
  cefr_results = get_cefr_stats(essay)
26
  full_prompt = generate_full_prompt(topic=topic, essay=essay, cefr_stat=cefr_results)
27
+ print("Generating prompt")
28
  essay = replace_single_newlines(essay)
29
  paragraph_cnt = len(essay.replace('\\n\\n', '\\n').split('\\n'))
 
 
 
 
 
 
 
 
 
30
  text = QWEN_TOKENIZER.apply_chat_template(
31
  [{"role": "user", "content": full_prompt}],
32
  tokenize=False,
 
40
  truncation=True,
41
  padding_side='left'
42
  ).to(device)
43
+ print("Tokenized")
44
+ start = time.time()
45
+ gen_config = GenerationConfig(
46
+ max_new_tokens=850, # cut way down from 1500
47
+ do_sample=True,
48
+ top_k=20,
49
+ top_p=0.9,
50
+ temperature=0.7,
51
+ eos_token_id=QWEN_TOKENIZER.eos_token_id,
52
+ pad_token_id=QWEN_TOKENIZER.eos_token_id,
53
+ )
54
+ with torch.inference_mode():
55
  outputs = QWEN_MODEL.generate(
56
  **inputs,
57
  generation_config=gen_config,
58
  use_cache=True,
59
  return_dict_in_generate=False,
60
  )
61
+ print("Generated", time.time() - start)
62
  generated_ids = outputs[0][inputs.input_ids.shape[1]:]
63
  full_feedback = QWEN_TOKENIZER.decode(
64
  generated_ids,
 
84
  'paragraph_count': feedback_components.get('paragraph_count', ''),
85
  'cefr_stat': feedback_components.get('cefr_stat', '')
86
  })
87
+ print("input got")
88
  score_inputs = LONGFORMER_TOKENIZER(
89
  score_input,
90
  return_tensors="pt",
 
97
  outputs = LONGFORMER_MODEL(**score_inputs) # Get full outputs dictionary
98
  scores = outputs['logits'].cpu().numpy()
99
  scores = [round(x) for x in scores[0]]
100
+ print("Score got")
101
  return scores, feedback_components