Spaces:

credent007
/

easyocr-phi3

Paused

App Files Files Community

credent007 committed on Apr 9

Commit

ed121f1

verified ·

1 Parent(s): 342762f

Update llm.py

Browse files

Files changed (1) hide show

llm.py +34 -40

llm.py CHANGED Viewed

@@ -2,55 +2,49 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from functools import partial
 import asyncio
 model_name = "microsoft/phi-3-mini-128k-instruct"
 # 8-bit quantization config
 quant_config = BitsAndBytesConfig(
-    load_in_8bit=True,
-    llm_int8_enable_fp32_cpu_offload=True # Helpful if GPU memory gets tight
 )
 # tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name)
 # model
 model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",
-    quantization_config=quant_config,
-    trust_remote_code=True
 )
-def execute_llm_sync(model, tokenizer, prompt: str):
-    """Synchronous function to run the actual inference."""
-    messages = [
-        {"role": "user", "content": prompt}
-    ]
-    # Prep inputs and move to the same device as the model (usually GPU)
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt"
-    ).to(model.device)
-    # Generate
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=2048 # 10,000 is very high; Phi-3 works better with smaller chunks
-        )
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 async def call_llm(prompt: str):
-    """Asynchronous wrapper to prevent blocking the main thread."""
-    loop = asyncio.get_running_loop()
-    # We use partial to pass arguments to the synchronous function
-    func = partial(execute_llm_sync, model, tokenizer, prompt)
-    # Run the heavy computation in a background thread
-    result = await loop.run_in_executor(None, func)
-    return result

 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from functools import partial
 import asyncio
 model_name = "microsoft/phi-3-mini-128k-instruct"
 # 8-bit quantization config
 quant_config = BitsAndBytesConfig(
+    load_in_8bit=True
 )
 # tokenizer
+# Loaded at import time from the same model id as the model below.
+# local_files_only=True: per the transformers docs, only local files are
+# consulted and no download is attempted.
+tokenizer = AutoTokenizer.from_pretrained(model_name,local_files_only=True)
 # model
 model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",  # device placement is delegated to the library
+    quantization_config=quant_config,  # the 8-bit config defined above
+    local_files_only=True,  # same local-only policy as the tokenizer
+    trust_remote_code=True  # NOTE(review): permits running code shipped in the model repo; confirm this is intended
 )
+# Mock LLM function (replace with your actual LLM API call)
 async def call_llm(prompt: str):
+    # Simulate LLM call in executor
+    llm_function_with_args=partial(execute_llm(model,tokenizer,prompt))
+    loop = asyncio.get_event_loop()
+    result = await loop.run_in_executor(None, llm_function_with_args)
+    return result
+async def execute_llm(model,tokenizer,prompt:str):
+    prompt="what is json give an example "
+    data=""
+    full_prompt=prompt+" "+data
+    messages = [
+        {"role": "user", "content":full_prompt }
+    ]
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt"
+        ).to(model.device)
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=500
+        )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)