Commit · efcab75
1 Parent(s): 3688b19
Remove vision features, focus on voice and response quality
Files changed: app/ora_server.py (+10, -55)
app/ora_server.py CHANGED
@@ -29,10 +29,8 @@ model = None
 tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Advanced AI Models
+# Advanced AI Models (Voice & Response Quality)
 whisper_model = None
-vision_model = None
-vision_processor = None
 emotion_classifier = None
 
 class ChatRequest(BaseModel):
@@ -78,39 +76,31 @@ async def load_model():
 
 @app.on_event("startup")
 async def load_advanced_ai():
-    global whisper_model, vision_model, vision_processor, emotion_classifier
+    global whisper_model, emotion_classifier
 
     try:
-        print("Loading
-        from transformers import pipeline
+        print("Loading Voice & Response Quality AI...")
+        from transformers import pipeline
 
-        # Whisper V3 for Speech-to-Text
-        print("Loading Whisper V3...")
+        # Whisper V3 for Speech-to-Text (Professional quality)
+        print("Loading Whisper V3 STT...")
         whisper_model = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-large-v3",
             device=0 if device == "cuda" else -1
         )
-        print("✓ Whisper V3 loaded")
+        print("✓ Whisper V3 loaded - Professional STT ready")
 
-        #
-        print("Loading Moondream2 Vision...")
-        vision_model = AutoModelForCausalLM.from_pretrained("vikhyatk/moondream2", trust_remote_code=True)
-        vision_processor = AutoProcessor.from_pretrained("vikhyatk/moondream2")
-        if device == "cuda":
-            vision_model = vision_model.to("cuda")
-        print("✓ Moondream2 loaded")
-
-        # Emotion Detection
+        # Emotion Detection for Compassionate Responses
         print("Loading Emotion Detector...")
         emotion_classifier = pipeline(
             "text-classification",
             model="j-hartmann/emotion-english-distilroberta-base",
             device=0 if device == "cuda" else -1
         )
-        print("✓ Emotion Detector loaded")
+        print("✓ Emotion Detector loaded - Empathetic responses enabled")
 
-        print("
+        print("Voice & Response Quality AI Ready!")
 
     except Exception as e:
         print(f"Warning: Could not load some AI models: {e}")
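Note (not part of the commit): the two pipelines this change keeps follow the standard transformers pipeline API. A minimal sketch of how they are typically called once loaded is below; the model names match the diff, while "sample.wav" and the input sentence are placeholders chosen here for illustration.

from transformers import pipeline

# Same models as in load_advanced_ai(); device=-1 keeps the sketch on CPU
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=-1)
emotion = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", device=-1)

text = asr("sample.wav")["text"]   # ASR pipelines return {"text": ...}
top = emotion(text)[0]             # text-classification returns [{"label": ..., "score": ...}]
print(f"Transcript: {text}")
print(f"Detected emotion: {top['label']} ({top['score']:.2f})")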
@@ -180,41 +170,6 @@ async def transcribe_audio(req: TranscribeRequest):
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
 
-class VisionRequest(BaseModel):
-    image_data: str  # Base64 encoded image
-    question: str = "What spiritual meaning does this image convey?"
-
-@app.post("/api/analyze-image")
-async def analyze_image(req: VisionRequest):
-    global vision_model, vision_processor
-
-    if vision_model is None or vision_processor is None:
-        raise HTTPException(status_code=503, detail="Vision model not loaded")
-
-    try:
-        import base64
-        from PIL import Image
-        import io
-
-        # Decode base64 image
-        image_bytes = base64.b64decode(req.image_data)
-        image = Image.open(io.BytesIO(image_bytes))
-
-        # Process with Moondream2
-        inputs = vision_processor(images=image, text=req.question, return_tensors="pt")
-        if device == "cuda":
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-        with torch.no_grad():
-            output = vision_model.generate(**inputs, max_new_tokens=256)
-
-        analysis = vision_processor.decode(output[0], skip_special_tokens=True)
-
-        return {"analysis": analysis}
-
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Vision analysis failed: {str(e)}")
-
 class EmotionRequest(BaseModel):
     text: str
 
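The EmotionRequest model that closes the last hunk already exists in ora_server.py; the route that consumes it is outside this diff. As a rough sketch only, an endpoint built on that model and the emotion_classifier pipeline loaded at startup could look like the following. The /api/detect-emotion path, the detect_emotion name, and the response keys are assumptions for illustration, not taken from the repository.

# Hypothetical sketch - the actual emotion route in ora_server.py is not shown in this diff.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=-1,  # CPU for the sketch
)

class EmotionRequest(BaseModel):
    text: str

@app.post("/api/detect-emotion")  # path is assumed, not from the diff
async def detect_emotion(req: EmotionRequest):
    # Guard mirrors the null-check style used for the removed vision endpoint
    if emotion_classifier is None:
        raise HTTPException(status_code=503, detail="Emotion model not loaded")
    result = emotion_classifier(req.text)[0]  # e.g. {"label": "joy", "score": 0.98}
    return {"emotion": result["label"], "confidence": result["score"]}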