EnariGmbH
/

surftown-1.0

@@ -5,8 +5,8 @@ from peft import PeftModel
 class EndpointHandler:
     def __init__(self):
-        self.base_model_name = "llava-hf/LLaVA-NeXT-Video-7B-hf"
-        self.adapter_model_name = "EnariGmbH/surftown-1.0"
         # Load the base model
         self.model = LlavaNextVideoForConditionalGeneration.from_pretrained(
@@ -21,6 +21,9 @@ class EndpointHandler:
         # Merge the adapter weights into the base model and unload the adapter
         self.model = self.model.merge_and_unload()
         # # Optionally, load and save the processor (if needed)
         self.processor = LlavaNextVideoProcessor.from_pretrained(self.adapter_model_name)
@@ -30,28 +33,58 @@ class EndpointHandler:
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
         Args:
-            data (Dict): Contains the input data including "clip" and "prompt".
         Returns:
             List[Dict[str, Any]]: The generated text from the model.
         """
         # Extract inputs from the data dictionary
         clip = data.get("clip")
-        prompt = data.get("prompt")
         # Both inputs are required; a missing one is reported as an error
         # entry in the returned list instead of raising.
         if clip is None or prompt is None:
             return [{"error": "Missing 'clip' or 'prompt' in input data"}]
         # Prepare the inputs for the model and move them to the model's device
         inputs_video = self.processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(self.model.device)
         # Generate output from the model (sampling enabled with top_p=0.9,
         # at most 512 new tokens)
         generate_kwargs = {"max_new_tokens": 512, "do_sample": True, "top_p": 0.9}
         output = self.model.generate(**inputs_video, **generate_kwargs)
         generated_text = self.processor.batch_decode(output, skip_special_tokens=True)
         # Extract the relevant part of the assistant's answer
         # NOTE(review): str.find returns -1 when "ASSISTANT:" is absent, so the
         # start index becomes len("ASSISTANT:") - 1 and the first 9 characters
         # of the decoded text are dropped in that case.
         assistant_answer_start = generated_text[0].find("ASSISTANT:") + len("ASSISTANT:")
         assistant_answer = generated_text[0][assistant_answer_start:].strip()
-        return [{"generated_text": assistant_answer}]

 class EndpointHandler:
     def __init__(self):
+        self.base_model_name = "llava-hf/LLaVA-NeXT-Video-7B-hf"  # Replace with the original base model ID
+        self.adapter_model_name = "EnariGmbH/surftown-1.0"  # Your fine-tuned adapter model ID
         # Load the base model
         self.model = LlavaNextVideoForConditionalGeneration.from_pretrained(
         # Merge the adapter weights into the base model and unload the adapter
         self.model = self.model.merge_and_unload()
+        # # Save the full model
+        # model.save_pretrained("surftown_fine_tuned_prompt_0")
         # # Optionally, load and save the processor (if needed)
         self.processor = LlavaNextVideoProcessor.from_pretrained(self.adapter_model_name)
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
         Args:
+            data (Dict): Contains the input data including "clip"
         Returns:
             List[Dict[str, Any]]: The generated text from the model.
         """
         # Extract inputs from the data dictionary
         clip = data.get("clip")
+        prompt = """
+        You are a surfing coach specialized on perfecting surfer's pop-up move. Please analyze the surfer's pop-up move in detail from the video.
+                    In your detailed analysis you should always mention: Wave Position and paddling, Pushing Phase, Transition, Reaching Phase and finnaly Balance and Control.
+                    At the end of your answer you must provide suggestions on how the surfer can improve in the next pop-up.
+                    Never mention your name in the answer and keep the answers short and direct.
+                    Your answers should ALWAYS follow this structure:
+                        Description: \n
+                            Wave Position and paddling: .\n.
+                            Pushing Phase: \n.
+                            Transition: \n.
+                            Reaching Phase: \n
+                            Balance and Control: \n\n\n
+                        Summary: \n
+                            Suggestions for improvement:\n
+                            """
+        # Define a conversation history for surfing pop-up move analysis
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "video"},
+                ],
+            },
+        ]
+        # Apply the chat template to create the prompt for the model
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
         if clip is None or prompt is None:
             return [{"error": "Missing 'clip' or 'prompt' in input data"}]
         # Prepare the inputs for the model
         inputs_video = self.processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(self.model.device)
         # Generate output from the model
         generate_kwargs = {"max_new_tokens": 512, "do_sample": True, "top_p": 0.9}
         output = self.model.generate(**inputs_video, **generate_kwargs)
         generated_text = self.processor.batch_decode(output, skip_special_tokens=True)
         # Extract the relevant part of the assistant's answer
         assistant_answer_start = generated_text[0].find("ASSISTANT:") + len("ASSISTANT:")
         assistant_answer = generated_text[0][assistant_answer_start:].strip()
+        return [{"generated_text": assistant_answer}]