Update app.py
app.py CHANGED

@@ -29,26 +29,19 @@ this demo is governed by the original [license](https://huggingface.co/spaces/hu
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-editing_model_id = "meta-llama/Llama-2-7b-chat-hf"
-editing_model = AutoModelForCausalLM.from_pretrained(editing_model_id, torch_dtype=torch.float16, device_map="auto")
-editing_tokenizer = AutoTokenizer.from_pretrained(model_id)
-editing_tokenizer.use_default_system_prompt = False
+# Model and Tokenizer Configuration
+model_id = "meta-llama/Llama-2-7b-hf"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=False,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=bnb_config)
+model = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer.pad_token = tokenizer.eos_token
 
-
 # MongoDB Connection
 PASSWORD = os.environ.get("MONGO_PASS")
 connect(host=f"mongodb+srv://ranamhammoud11:{PASSWORD}@stories.zf5v52a.mongodb.net/")

@@ -69,10 +62,9 @@ def process_text(text):
 
     return text
 
-
+# Gradio Function
 @spaces.GPU
 def generate(
-    model_choice: str,
     message: str,
     chat_history: list[tuple[str, str]],
     max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,

@@ -81,38 +73,19 @@ def generate(
     top_k: int = 20,
     repetition_penalty: float = 1.0,
 ) -> Iterator[str]:
-    if chat_history is None:
-        chat_history = []
-
     conversation = []
-
-
-
-
-
-    tokenizer = editing_tokenizer
-
-    # Checking each tuple in chat_history to ensure it has exactly two elements
-    for item in chat_history:
-        if isinstance(item, tuple) and len(item) == 2:
-            user, assistant = item
-            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-        else:
-            print(f"Error in chat history item: {item}. Each item must be a tuple with exactly two elements.")
-            continue  # Skip this item or handle appropriately
-
-    # Append the current user message
-    conversation.append({"role": "user", "content": message})
-
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    for user, assistant in chat_history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": make_prompt(message)})
+    enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
+    input_ids = enc.input_ids.to(model.device)
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=False)
     generate_kwargs = dict(
-        input_ids
+        {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,

@@ -127,9 +100,10 @@ def generate(
 
     outputs = []
     for text in streamer:
-
-
-
+        processed_text = process_text(text)
+        outputs.append(processed_text)
+        output = "".join(outputs)
+        yield output
 
     final_story = "".join(outputs)
     try:

@@ -142,7 +116,6 @@
 chat_interface = gr.ChatInterface(
     fn=generate,
     stop_btn=None,
-    additional_inputs=[gr.Dropdown(["Storytell", "HF Meta Llama 7b Chat"], label="Choose Model")],
     examples=[
         ["Can you explain briefly to me what is the Python programming language?"],
         ["Could you please provide an explanation about the concept of recursion?"],
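The hunks flip `skip_special_tokens` from True to False (so `process_text` can see and rewrite the raw markers) and pass `input_ids` to `generate_kwargs` as a mapping, but the thread that actually drives generation sits outside the changed lines. The standard pattern with `TextIteratorStreamer`, and presumably what the unchanged part of `app.py` still does, looks like this sketch inside `generate()`, reusing the names defined above:

```python
from threading import Thread

from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(
    tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=False
)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=max_new_tokens,
    do_sample=True,
)
# model.generate blocks until done, so it runs on a worker thread while the
# streamer yields decoded chunks to this Gradio generator as they arrive.
Thread(target=model.generate, kwargs=generate_kwargs).start()

outputs = []
for text in streamer:
    outputs.append(process_text(text))
    yield "".join(outputs)
```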
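One behavioral wrinkle worth flagging: the new code still assembles `conversation` from `chat_history`, but then tokenizes only `make_prompt(message)` (a helper defined elsewhere in the file), so earlier turns never actually reach the model. If multi-turn context is intended, the old chat-template route would consume the full list, assuming the tokenizer carries a chat template:

```python
# Sketch: feed the whole conversation rather than just the latest prompt.
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
```

Note that the base `meta-llama/Llama-2-7b-hf` tokenizer, unlike the chat variant, may not define a chat template, which could be exactly why this commit switches to plain tokenization of `make_prompt(message)`.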
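The mongoengine `connect(...)` call and the `try:` that follows `final_story` suggest each finished story is persisted to MongoDB, but the document class sits outside the changed lines. A hypothetical minimal schema, with the class and field names invented purely for illustration:

```python
import os

from mongoengine import Document, StringField, connect

connect(host=f"mongodb+srv://ranamhammoud11:{os.environ.get('MONGO_PASS')}@stories.zf5v52a.mongodb.net/")

class Story(Document):  # hypothetical: the real schema is not shown in this diff
    message = StringField(required=True)
    story = StringField(required=True)

Story(message="Tell me a story.", story="Once upon a time...").save()
```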