Spaces:

sagar007
/

Multimodal_App

Build error

App Files Community

sagar007 committed on Aug 26, 2024

Commit

78e7cbb

verified ·

1 Parent(s): 9f22f0a

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -75

app.py CHANGED Viewed

@@ -41,12 +41,21 @@ text_model = AutoModelForCausalLM.from_pretrained(
     quantization_config=quantization_config
 )
-vision_model = AutoModelForCausalLM.from_pretrained(
-    VISION_MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype="auto",
-    attn_implementation="flash_attention_2"
-).to(device).eval()
 vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
@@ -55,80 +64,84 @@ tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
 # Helper functions
-# Helper functions
-@spaces.GPU
 def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20):
-    conversation = [{"role": "system", "content": system_prompt}]
-    for prompt, answer in history:
-        conversation.extend([
-            {"role": "user", "content": prompt},
-            {"role": "assistant", "content": answer},
-        ])
-    conversation.append({"role": "user", "content": message})
-    input_ids = text_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(text_model.device)
-    attention_mask = torch.ones_like(input_ids)  # Create attention mask
-    streamer = TextIteratorStreamer(text_tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        attention_mask=attention_mask,  # Pass attention mask
-        max_new_tokens=max_new_tokens,
-        do_sample=temperature > 0,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        eos_token_id=[128001, 128008, 128009],
-        streamer=streamer,
-    )
-    with torch.no_grad():
-        thread = Thread(target=text_model.generate, kwargs=generate_kwargs)
-        thread.start()
-    buffer = ""
-    audio_buffer = np.array([])
-    for new_text in streamer:
-        buffer += new_text
-        # Generate speech for the new text
-        tts_input_ids = tts_tokenizer(new_text, return_tensors="pt").input_ids.to(device)
-        tts_description = "A clear and natural voice reads the text with moderate speed and expression."
-        tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)
         with torch.no_grad():
-            audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)
-        new_audio = audio_generation.cpu().numpy().squeeze()
-        audio_buffer = np.concatenate((audio_buffer, new_audio))
-        yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
-@spaces.GPU
-def process_vision_query(image, text_input):
-    prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
-    # Ensure the image is in the correct format
-    if isinstance(image, np.ndarray):
-        # Convert numpy array to PIL Image
-        image = Image.fromarray(image).convert("RGB")
-    elif not isinstance(image, Image.Image):
-        raise ValueError("Invalid image type. Expected PIL.Image.Image or numpy.ndarray")
-    # Now process the image
-    inputs = vision_processor(prompt, images=image, return_tensors="pt").to(device)
-    with torch.no_grad():
-        generate_ids = vision_model.generate(
-            **inputs,
-            max_new_tokens=1000,
-            eos_token_id=vision_processor.tokenizer.eos_token_id
-        )
-    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-    response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    return response
 # Custom CSS
 custom_css = """
@@ -206,6 +219,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
         submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k], [chatbot, audio_output])
         clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():
             with gr.Column(scale=1):

     quantization_config=quantization_config
 )
+try:
+    vision_model = AutoModelForCausalLM.from_pretrained(
+        VISION_MODEL_ID,
+        trust_remote_code=True,
+        torch_dtype="auto",
+        attn_implementation="flash_attention_2"
+    ).to(device).eval()
+except Exception as e:
+    print(f"Error loading model with flash attention: {e}")
+    print("Falling back to default attention implementation")
+    vision_model = AutoModelForCausalLM.from_pretrained(
+        VISION_MODEL_ID,
+        trust_remote_code=True,
+        torch_dtype="auto"
+    ).to(device).eval()
 vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
 # Helper functions
+@spaces.GPU(timeout=300)  # Increase timeout to 5 minutes
 def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20):
+    try:
+        conversation = [{"role": "system", "content": system_prompt}]
+        for prompt, answer in history:
+            conversation.extend([
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": answer},
+            ])
+        conversation.append({"role": "user", "content": message})
+        input_ids = text_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(text_model.device)
+        attention_mask = torch.ones_like(input_ids)
+        streamer = TextIteratorStreamer(text_tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+        generate_kwargs = dict(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=max_new_tokens,
+            do_sample=temperature > 0,
+            top_p=top_p,
+            top_k=top_k,
+            temperature=temperature,
+            eos_token_id=[128001, 128008, 128009],
+            streamer=streamer,
+        )
         with torch.no_grad():
+            thread = Thread(target=text_model.generate, kwargs=generate_kwargs)
+            thread.start()
+        buffer = ""
+        audio_buffer = np.array([])
+        for new_text in streamer:
+            buffer += new_text
+            # Generate speech for the new text
+            tts_input_ids = tts_tokenizer(new_text, return_tensors="pt").input_ids.to(device)
+            tts_description = "A clear and natural voice reads the text with moderate speed and expression."
+            tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)
+            with torch.no_grad():
+                audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)
+            new_audio = audio_generation.cpu().numpy().squeeze()
+            audio_buffer = np.concatenate((audio_buffer, new_audio))
+            yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        yield history + [[message, f"An error occurred: {str(e)}"]], None
+@spaces.GPU(timeout=300)  # Increase timeout to 5 minutes
+def process_vision_query(image, text_input):
+    try:
+        prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
+        # Ensure the image is in the correct format
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image).convert("RGB")
+        elif not isinstance(image, Image.Image):
+            raise ValueError("Invalid image type. Expected PIL.Image.Image or numpy.ndarray")
+        inputs = vision_processor(prompt, images=image, return_tensors="pt").to(device)
+        with torch.no_grad():
+            generate_ids = vision_model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                eos_token_id=vision_processor.tokenizer.eos_token_id
+            )
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        return response
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return f"An error occurred: {str(e)}"
 # Custom CSS
 custom_css = """
         submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k], [chatbot, audio_output])
         clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():
             with gr.Column(scale=1):