Abdalkaderdev committed on
Commit 5306cf5 · 1 Parent(s): b062b38

Add Whisper V3, Moondream2, and Emotion Detection with API endpoints

Files changed (1): app/ora_server.py +129 -0
app/ora_server.py CHANGED
@@ -29,6 +29,12 @@ model = None
 tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+# Advanced AI Models
+whisper_model = None
+vision_model = None
+vision_processor = None
+emotion_classifier = None
+
 class ChatRequest(BaseModel):
     message: str
     history: list = []
@@ -70,6 +76,46 @@ async def load_model():
 
     print("ORA Model Connected and Ready.")
 
+@app.on_event("startup")
+async def load_advanced_ai():
+    global whisper_model, vision_model, vision_processor, emotion_classifier
+
+    try:
+        print("Loading Advanced AI Models...")
+        from transformers import pipeline, AutoModelForCausalLM, AutoProcessor
+
+        # Whisper V3 for Speech-to-Text
+        print("Loading Whisper V3...")
+        whisper_model = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-large-v3",
+            device=0 if device == "cuda" else -1
+        )
+        print("✓ Whisper V3 loaded")
+
+        # Moondream2 for Vision
+        print("Loading Moondream2 Vision...")
+        vision_model = AutoModelForCausalLM.from_pretrained("vikhyatk/moondream2", trust_remote_code=True)
+        vision_processor = AutoProcessor.from_pretrained("vikhyatk/moondream2")
+        if device == "cuda":
+            vision_model = vision_model.to("cuda")
+        print("✓ Moondream2 loaded")
+
+        # Emotion Detection
+        print("Loading Emotion Detector...")
+        emotion_classifier = pipeline(
+            "text-classification",
+            model="j-hartmann/emotion-english-distilroberta-base",
+            device=0 if device == "cuda" else -1
+        )
+        print("✓ Emotion Detector loaded")
+
+        print("All Advanced AI Models Ready!")
+
+    except Exception as e:
+        print(f"Warning: Could not load some AI models: {e}")
+        print("ORA will continue with basic functionality.")
+
 @app.post("/api/chat")
 async def chat_endpoint(req: ChatRequest):
     global model, tokenizer
@@ -107,6 +153,89 @@ async def chat_endpoint(req: ChatRequest):
 
     return {"response": response_text}
 
+# Advanced AI Endpoints
+
+class TranscribeRequest(BaseModel):
+    audio_data: str  # Base64 encoded audio
+
+@app.post("/api/transcribe")
+async def transcribe_audio(req: TranscribeRequest):
+    global whisper_model
+
+    if whisper_model is None:
+        raise HTTPException(status_code=503, detail="Whisper model not loaded")
+
+    try:
+        import base64
+        import io
+
+        # Decode base64 audio
+        audio_bytes = base64.b64decode(req.audio_data)
+
+        # Transcribe with Whisper
+        result = whisper_model(audio_bytes)
+
+        return {"text": result["text"], "confidence": 1.0}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
+
+class VisionRequest(BaseModel):
+    image_data: str  # Base64 encoded image
+    question: str = "What spiritual meaning does this image convey?"
+
+@app.post("/api/analyze-image")
+async def analyze_image(req: VisionRequest):
+    global vision_model, vision_processor
+
+    if vision_model is None or vision_processor is None:
+        raise HTTPException(status_code=503, detail="Vision model not loaded")
+
+    try:
+        import base64
+        from PIL import Image
+        import io
+
+        # Decode base64 image
+        image_bytes = base64.b64decode(req.image_data)
+        image = Image.open(io.BytesIO(image_bytes))
+
+        # Process with Moondream2
+        inputs = vision_processor(images=image, text=req.question, return_tensors="pt")
+        if device == "cuda":
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        with torch.no_grad():
+            output = vision_model.generate(**inputs, max_new_tokens=256)
+
+        analysis = vision_processor.decode(output[0], skip_special_tokens=True)
+
+        return {"analysis": analysis}
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Vision analysis failed: {str(e)}")
+
+class EmotionRequest(BaseModel):
+    text: str
+
+@app.post("/api/detect-emotion")
+async def detect_emotion(req: EmotionRequest):
+    global emotion_classifier
+
+    if emotion_classifier is None:
+        raise HTTPException(status_code=503, detail="Emotion model not loaded")
+
+    try:
+        result = emotion_classifier(req.text)[0]
+        return {
+            "emotion": result["label"],
+            "confidence": result["score"]
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Emotion detection failed: {str(e)}")
+
+
 
 
 # TTS endpoint using Supertonic 2
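
For reference, a minimal client call against the new /api/transcribe endpoint could look like the sketch below. The host, port, and sample.wav file name are assumptions for illustration, not part of this commit; the audio is sent as a base64 string in JSON, matching the TranscribeRequest model added above.

# Hypothetical client for POST /api/transcribe (URL and file name are assumptions)
import base64
import requests

with open("sample.wav", "rb") as f:                          # any short audio clip
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")   # base64 -> JSON-safe string

resp = requests.post(
    "http://localhost:8000/api/transcribe",
    json={"audio_data": audio_b64},                          # matches TranscribeRequest
    timeout=120,
)
resp.raise_for_status()
print(resp.json())  # e.g. {"text": "...", "confidence": 1.0}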
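
A similar sketch for /api/analyze-image: the image bytes go in image_data as base64, and question is optional because VisionRequest defines a default. Again, the file name, host, and port are assumptions.

# Hypothetical client for POST /api/analyze-image (URL and file name are assumptions)
import base64
import requests

with open("icon.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:8000/api/analyze-image",
    json={"image_data": image_b64, "question": "What is shown in this image?"},
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["analysis"])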
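
Finally, a minimal call to /api/detect-emotion, which only needs plain text; the response mirrors the label/score pair returned by the classifier. The URL is again an assumption.

# Hypothetical client for POST /api/detect-emotion (URL is an assumption)
import requests

resp = requests.post(
    "http://localhost:8000/api/detect-emotion",
    json={"text": "I feel calm and grateful today."},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())  # e.g. {"emotion": "joy", "confidence": 0.98}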