Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,67 +1,38 @@
 import os
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer
 from optimum.intel.openvino import OVModelForCausalLM
-from generation_utils import run_generation, estimate_latency, reset_textbox,get_special_token_id
+from generation_utils import run_generation, estimate_latency, reset_textbox, get_special_token_id
 from config import SUPPORTED_LLM_MODELS
 import gradio as gr
 from threading import Thread
 from time import perf_counter
 from typing import List
-from transformers import
+from transformers import TextIteratorStreamer
 import numpy as np
-import os
-from flask import Flask, render_template, redirect, url_for, request, flash
-from flask_sqlalchemy import SQLAlchemy
-from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
-from werkzeug.security import generate_password_hash, check_password_hash
-
-app = Flask(__name__)
-
-
-if __name__ == '__main__':
-    app.run(debug=True)
-model_dir = "C:/phi-2/INT8_compressed_weights"
-print(f"Checking model directory: {model_dir}")
-print(f"Contents: {os.listdir(model_dir)}")  # Check contents of the directory
-
-print(f"Loading model from {model_dir}")
-
 
+# Model configuration and loading
+model_dir = "C:/Users/KIIT/OneDrive/Desktop/INTEL/phi-2/INT8_compressed_weights"
 model_name = "susnato/phi-2"
 model_configuration = SUPPORTED_LLM_MODELS["phi-2"]
 ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
 
-
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 ov_model = OVModelForCausalLM.from_pretrained(
     model_dir,
     device="CPU",
     ov_config=ov_config,
 )
-tokenizer = AutoTokenizer.from_pretrained(model_name)
 tokenizer_kwargs = model_configuration.get("toeknizer_kwargs", {})
-# Continue with your tokenizer usage
 response_key = model_configuration.get("response_key")
 tokenizer_response_key = None
 
 def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:
-    """
-    Gets the token ID for a given string that has been added to the tokenizer as a special token.
-
-    Args:
-        tokenizer (PreTrainedTokenizer): the tokenizer
-        key (str): the key to convert to a single token
-
-    Raises:
-        ValueError: if more than one ID was generated
-
-    Returns:
-        int: the token ID for the given key
-    """
     token_ids = tokenizer.encode(key)
     if len(token_ids) > 1:
         raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
     return token_ids[0]
+
 if response_key is not None:
     tokenizer_response_key = next(
         (token for token in tokenizer.additional_special_tokens if token.startswith(response_key)),
@@ -73,8 +44,7 @@ if tokenizer_response_key:
     try:
         end_key = model_configuration.get("end_key")
         if end_key:
-            end_key_token_id =get_special_token_id(tokenizer, end_key)
-        # Ensure generation stops once it generates "### End"
+            end_key_token_id = get_special_token_id(tokenizer, end_key)
     except ValueError:
         pass
 
@@ -89,20 +59,6 @@ def estimate_latency(
     per_token_time: List[float],
     num_tokens: int,
 ):
-    """
-    Helper function for performance estimation
-
-    Parameters:
-      current_time (float): This step time in seconds.
-      current_perf_text (str): Current content of performance UI field.
-      new_gen_text (str): New generated text.
-      per_token_time (List[float]): history of performance from previous steps.
-      num_tokens (int): Total number of generated tokens.
-
-    Returns:
-      update for performance text field
-      update for a total number of tokens
-    """
     num_current_toks = len(tokenizer.encode(new_gen_text))
     num_tokens += num_current_toks
     per_token_time.append(num_current_toks / current_time)
@@ -113,6 +69,7 @@ def estimate_latency(
         num_tokens,
     )
     return current_perf_text, num_tokens
+
 def run_generation(
     user_text: str,
     top_p: float,
@@ -121,29 +78,8 @@ def run_generation(
     max_new_tokens: int,
     perf_text: str,
 ):
-    """
-    Text generation function
-
-    Parameters:
-      user_text (str): User-provided instruction for a generation.
-      top_p (float): Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for a generation.
-      temperature (float): The value used to module the logits distribution.
-      top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering.
-      max_new_tokens (int): Maximum length of generated sequence.
-      perf_text (str): Content of text field for printing performance results.
-    Returns:
-      model_output (str) - model-generated text
-      perf_text (str) - updated perf text filed content
-    """
-
-    # Prepare input prompt according to model expected template
     prompt_text = prompt_template.format(instruction=user_text)
-
-    # Tokenize the user text.
     model_inputs = tokenizer(prompt_text, return_tensors="pt", **tokenizer_kwargs)
-
-    # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
-    # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         model_inputs,
@@ -158,8 +94,6 @@ def run_generation(
     )
     t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
     t.start()
-
-    # Pull the generated text from the streamer, and update the model output.
    model_output = ""
     per_token_time = []
     num_tokens = 0
@@ -171,22 +105,10 @@ def run_generation(
         yield model_output, perf_text
         start = perf_counter()
     return model_output, perf_text
-def reset_textbox(instruction: str, response: str, perf: str):
-    """
-    Helper function for resetting content of all text fields
 
-
-        instruction (str): Content of user instruction field.
-        response (str): Content of model response field.
-        perf (str): Content of performance info filed
-
-    Returns:
-        empty string for each placeholder
-    """
+def reset_textbox(instruction: str, response: str, perf: str):
     return "", "", ""
 
-
-
 examples = [
     "Give me a recipe for pizza with pineapple",
     "Write me a tweet about the new OpenVINO release",
@@ -269,12 +191,12 @@ def main():
         [user_text, model_output, performance],
     )
 
-
-
-
-
-
-
+    demo.queue()
+    try:
+        demo.launch(height=800)
+    except Exception:
+        demo.launch(share=True, height=800)
+
+if __name__ == "__main__":
+    main()
 
-# Call main function to start Gradio interface
-main()
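Note: the hunks above elide most of run_generation's body, but the added TextIteratorStreamer import and the Thread lines show the streaming pattern the updated app.py relies on: generation runs on a background thread while the caller consumes text from the streamer. Below is a minimal, self-contained sketch of that pattern, assuming only a local OpenVINO export of phi-2 at MODEL_DIR; stream_generate and MODEL_DIR are illustrative names, not part of this repo, and the real app additionally formats the prompt with prompt_template and passes sampling parameters (top_p, top_k, temperature).

from threading import Thread

from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer

MODEL_DIR = "phi-2/INT8_compressed_weights"  # assumption: a local INT8 OpenVINO export of phi-2

tokenizer = AutoTokenizer.from_pretrained("susnato/phi-2")
ov_model = OVModelForCausalLM.from_pretrained(MODEL_DIR, device="CPU")


def stream_generate(prompt: str, max_new_tokens: int = 128):
    """Yield the growing response string as tokens arrive (illustrative helper)."""
    model_inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=max_new_tokens)
    # Run generate() on a background thread so this function can consume the streamer.
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()
    output = ""
    for new_text in streamer:
        output += new_text
        yield output


if __name__ == "__main__":
    for partial in stream_generate("Write me a tweet about the new OpenVINO release"):
        print(partial)

Keeping generate() off the consuming thread is what lets the Gradio callback (or the print loop above) update incrementally instead of blocking until the full sequence is produced.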