try optimize
__pycache__/create_app.cpython-312.pyc CHANGED
Binary files a/__pycache__/create_app.cpython-312.pyc and b/__pycache__/create_app.cpython-312.pyc differ
create_app.py CHANGED

@@ -31,7 +31,7 @@ def load_models():
     model_name = 'Qwen/Qwen3-1.7B'
     QWEN_TOKENIZER = AutoTokenizer.from_pretrained(model_name, device='auto')
     QWEN_TOKENIZER.pad_token_id = QWEN_TOKENIZER.eos_token_id
-    QWEN_MODEL = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).half()
+    QWEN_MODEL = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto", load_in_8bit=True, torch_dtype=torch.float16).half()
     QWEN_MODEL = QWEN_MODEL.to(device)
     MODELS_LOADED = True
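A note on the new loader line: recent transformers releases reject casting or moving a bitsandbytes-quantized model, so the chained .half() and the following QWEN_MODEL.to(device) would raise a ValueError once load_in_8bit=True takes effect (device_map="auto" already places the int8 weights). Passing load_in_8bit directly to from_pretrained is also deprecated in favor of BitsAndBytesConfig, and AutoTokenizer.from_pretrained takes no device argument. A minimal sketch of how the loader could look under those assumptions (load_qwen is a hypothetical name; assumes accelerate and bitsandbytes are installed):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_qwen(model_name: str = 'Qwen/Qwen3-1.7B'):
    # Tokenizers are CPU-side objects; from_pretrained takes no `device` kwarg.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    # BitsAndBytesConfig is the non-deprecated spelling of load_in_8bit=True.
    # device_map="auto" already places the int8 weights, so no .half()/.to(device):
    # both raise ValueError on a quantized model in recent transformers releases.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )
    return tokenizer, model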
inference/__pycache__/infer_single.cpython-312.pyc CHANGED
Binary files a/inference/__pycache__/infer_single.cpython-312.pyc and b/inference/__pycache__/infer_single.cpython-312.pyc differ
inference/infer_single.py CHANGED

@@ -1,7 +1,9 @@
 from utils.data_utils import *
 from utils.prompts import *
 import torch
+from torch.cuda.amp import autocast
 from create_app import *
+from transformers import GenerationConfig

 def replace_single_newlines(text):
     return re.sub(r'(?<!\n)\n(?!\n)', '\\\\n\\\\n', text)
@@ -16,15 +18,21 @@ def generate_full_prompt(topic, essay, cefr_stat):


 def generate_and_score_essay(topic, essay):
-    global MODELS_LOADED, LONGFORMER_TOKENIZER, LONGFORMER_MODEL, QWEN_TOKENIZER, QWEN_MODEL
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    LONGFORMER_MODEL
-    QWEN_MODEL = QWEN_MODEL.to(device)
-
+    global MODELS_LOADED, LONGFORMER_TOKENIZER, LONGFORMER_MODEL, QWEN_TOKENIZER, QWEN_MODEL
     cefr_results = get_cefr_stats(essay)
     full_prompt = generate_full_prompt(topic=topic, essay=essay, cefr_stat=cefr_results)
     essay = replace_single_newlines(essay)
     paragraph_cnt = len(essay.replace('\\n\\n', '\\n').split('\\n'))
+    gen_config = GenerationConfig(
+        max_new_tokens=512,  # cut way down from 1500
+        do_sample=True,
+        top_k=50,
+        top_p=0.9,
+        temperature=0.7,
+        eos_token_id=QWEN_TOKENIZER.eos_token_id,
+        pad_token_id=QWEN_TOKENIZER.eos_token_id,
+    )
     text = QWEN_TOKENIZER.apply_chat_template(
         [{"role": "user", "content": full_prompt}],
         tokenize=False,
@@ -38,12 +46,12 @@ def generate_and_score_essay(topic, essay):
         truncation=True,
         padding_side='left'
     ).to(device)
-    with torch.no_grad():
+    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16):
         outputs = QWEN_MODEL.generate(
             **inputs,
-            max_new_tokens=1500,
-            use_cache=True,
-
+            generation_config=gen_config,
+            use_cache=True,
+            return_dict_in_generate=False,
         )
     generated_ids = outputs[0][inputs.input_ids.shape[1]:]
     full_feedback = QWEN_TOKENIZER.decode(
@@ -78,7 +86,7 @@ def generate_and_score_essay(topic, essay):
         padding=True
     ).to(device)
     LONGFORMER_MODEL.eval()
-    with torch.no_grad():
+    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16):
         outputs = LONGFORMER_MODEL(**score_inputs)  # Get full outputs dictionary
         scores = outputs['logits'].cpu().numpy()
         scores = [round(x) for x in scores[0]]
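One caveat on the mixed-precision wrapper: the context manager imported via from torch.cuda.amp import autocast has the signature autocast(enabled=True, dtype=torch.float16, cache_enabled=True) and accepts no device_type keyword; that keyword belongs to torch.amp.autocast. As written, both autocast(device_type='cuda', ...) calls would raise a TypeError at runtime. A small sketch of the corrected pattern, using names from the diff inside a hypothetical helper (generate_feedback is not in the commit):

import torch
from torch.amp import autocast  # unlike torch.cuda.amp.autocast, this accepts device_type
from transformers import GenerationConfig

def generate_feedback(model, tokenizer, inputs, gen_config: GenerationConfig) -> str:
    # Inference only: no autograd, fp16 autocast on CUDA.
    with torch.no_grad(), autocast(device_type='cuda', dtype=torch.float16):
        outputs = model.generate(**inputs, generation_config=gen_config, use_cache=True)
    # Strip the prompt tokens and decode only the newly generated tail.
    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

Collecting the sampling knobs in a GenerationConfig is a reasonable refactor; the actual latency win here comes from max_new_tokens=512 and use_cache=True, since a model already held in int8/fp16 gains little from autocast on top.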
instance/users.db CHANGED
Binary files a/instance/users.db and b/instance/users.db differ
requirements.txt CHANGED

@@ -10,4 +10,5 @@ flask==3.1.1
 flask_login==0.6.3
 werkzeug==3.1.3
 flask_sqlalchemy==3.1.1
-gunicorn
+gunicorn
+bitsandbytes-0.42.0
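Note that bitsandbytes-0.42.0 is not valid requirements syntax: pip would look for a distribution literally named that and fail to resolve it. The pin presumably intended is:

bitsandbytes==0.42.0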
views/__pycache__/auth.cpython-312.pyc CHANGED
Binary files a/views/__pycache__/auth.cpython-312.pyc and b/views/__pycache__/auth.cpython-312.pyc differ

views/__pycache__/infer.cpython-312.pyc CHANGED
Binary files a/views/__pycache__/infer.cpython-312.pyc and b/views/__pycache__/infer.cpython-312.pyc differ
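A final housekeeping note: the commit also tracks compiled __pycache__/*.pyc files and the SQLite database instance/users.db, which change on every run and will keep showing up as binary diffs. A typical .gitignore for a Flask project of this shape would be (a suggestion, not part of this commit):

__pycache__/
*.pyc
instance/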