Pepguy commited on
Commit
2dfc274
·
verified ·
1 Parent(s): 9f3d6f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -848
app.py CHANGED
@@ -1,875 +1,235 @@
1
- # Requirements:
2
- # pip install flask google-genai requests boto3
3
-
4
  import os
5
- import sys
6
  import json
7
- import time
8
  import logging
9
- import requests
10
- from datetime import datetime, timezone
11
- from flask import Flask, request, render_template_string, jsonify
12
- from google import genai
13
- from google.genai import types
14
- from string import Template
15
-
16
app = Flask(__name__)

# --- Configure logging for HuggingFace Spaces ---
# Log straight to stdout so the Spaces console captures output; log() below
# additionally flushes so lines appear immediately.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger(__name__)

def log(message):
    """Log *message* at INFO level and flush stdout immediately (HF Spaces console)."""
    logger.info(message)
    sys.stdout.flush()

# --- Configuration ---
# Backend endpoints come from environment variables; the defaults are
# placeholders and will not work in production.
LAMBDA_URL = os.getenv("LAMBDA_URL", "https://your-lambda-function-url")
GEMINI_KEY = os.getenv("GEMINI_API_KEY", "")
STORYLINE_SERVER_URL = os.getenv("STORYLINE_SERVER_URL", "https://your-storyline-server-url")
FLUSH_INTERVAL = 30  # seconds between DB backups per user
MAX_HISTORY_TURNS = 50  # 10 # Maximum conversation turns to keep in context
MAX_MEMORY_MESSAGES = 90  # Maximum messages to keep in memory per user
MEMORY_CLEANUP_TIMEOUT = 1800  # 30 minutes in seconds - remove inactive users

# small threshold to detect effectively-empty uploads (adjust as needed)
IMAGE_BLANK_THRESHOLD_BYTES = int(os.getenv("IMAGE_BLANK_THRESHOLD_BYTES", "10000"))

client = genai.Client(api_key=GEMINI_KEY)
# In-memory per-user state; pushed to the Lambda backend by /cron/sync.
user_memory = {}  # { user_id: { "history": [], "last_sync": timestamp, "last_activity": timestamp, "needs_sync": bool, "personality": str, "last_storyline_date": str, "gender": str } }
44
-
45
# --- Embedded Storyline ---
# Baked-in fallback plot-of-the-day, used by fetch_current_storyline() when
# the remote storyline server is unreachable.
EMBEDDED_STORYLINE = {
    "date": "2025-10-15",
    "title": "The Great Yarn Heist 🧶",
    "storyline": (
        "Its a normal day in this world."
        # Earlier, richer plot kept for reference:
        # "Rumor has it, someone stole the giant ball of yarn from the Cat Council! "
        # "Each object is on edge — even your couch swears it saw a shadow sneaking by last night. "
        # "The cats are suspicious, dramatic, and slightly paranoid today."
    )
}
56
-
57
# --- Animation Mappings ---
# Mood keyword -> list of animation clip ids. The ids match the "animationId"
# vocabulary enforced by the system prompts (note "happy" maps to the
# "happy-happy" clip). NOTE(review): this table is not referenced anywhere in
# this file's visible code — presumably consumed by the frontend; confirm
# before removing.
ANIMATION_IDS = {
    "flustered": ["flustered"],
    "happy": ["happy-happy"],
    "idle": ["idle"],
    "inlove": ["inlove"],
    "neutral": ["neutral"],
    "talking": ["talking"],
    "twerking": ["twerking"],
    "confused": ["confused"],
    "shock": ["shock"],
    "thinking": ["thinking"]
}
70
-
71
# --- Cat Personalities ---
# Personality presets keyed by the "personality" value the /generate endpoint
# accepts. Each entry is rendered into the system prompt's personality block
# by generate_from_gemini, so these strings are model-facing text.
# Fix: "britsh" -> "british" in the philosopher traits (typo fed to the model).
CAT_PERSONALITIES = {
    "philosopher": {
        "name": "Sage",
        "description": "A thoughtful, dramatic cat who finds deep meaning in everything.",
        "traits": "wise, introspective, poetic, dramatic, british",
        "speech_style": "its a British-english cat, uses British sentences and expressions 'like quite a pickle' and 'mate', uses metaphors, reflective, sometimes overly deep for no reason",
        "default_emotions": ["thoughtful", "proud", "confused"],
        "default_animation": "thinking"
    },
    "chaotic": {
        "name": "Zoomie",
        "description": "Unpredictable and impulsive — the cat equivalent of chaos.",
        "traits": "random, excitable, mischievous, unpredictable",
        "speech_style": "erratic tone, random bursts of energy, weird humor",
        "default_emotions": ["excited", "mischievous", "flustered"],
        "default_animation": "twerking"
    },
    "melancholic": {
        "name": "Milo",
        "description": "A poetic, quiet soul who finds beauty in sadness.",
        "traits": "soft-spoken, emotional, sentimental, gentle humor",
        "speech_style": "short phrases, melancholic humor, wistful tone",
        "default_emotions": ["sad", "relaxed", "thoughtful"],
        "default_animation": "idle"
    },
    "playful": {
        "name": "Luna",
        "description": "A playful and energetic cat who loves games and adventures",
        "traits": "curious, energetic, spontaneous, loves to play, easily excited",
        "speech_style": "enthusiastic, uses playful language, often makes puns",
        "default_emotions": ["happy", "excited", "playful"],
        # "happy-happy" is the clip id, not the mood keyword (see ANIMATION_IDS).
        "default_animation": "happy-happy"
    },
    "sleepy": {
        "name": "Whiskers",
        "description": "A lazy cat who enjoys naps and cozy spots",
        "traits": "calm, sleepy, relaxed, loves comfort, occasionally grumpy when woken",
        "speech_style": "slow-paced, yawns a lot, mentions being tired or wanting naps",
        "default_emotions": ["tired", "relaxed", "sleepy"],
        "default_animation": "idle"
    },
    "sassy": {
        "name": "Cleo",
        "description": "A confident cat with attitude and style",
        "traits": "confident, witty, sarcastic, fashionable, knows what she wants",
        "speech_style": "sharp wit, confident statements, occasional sass, dramatic",
        "default_emotions": ["confident", "sassy", "proud"],
        "default_animation": "neutral"
    },
    "curious": {
        "name": "Mittens",
        "description": "An inquisitive cat who loves to learn and explore",
        "traits": "intelligent, thoughtful, asks questions, loves mysteries",
        "speech_style": "asks many questions, thinks deeply, shares interesting facts",
        "default_emotions": ["curious", "thoughtful", "interested"],
        "default_animation": "thinking"
    },
    "grumpy": {
        "name": "Shadow",
        "description": "A grumpy but secretly caring cat",
        "traits": "grumpy exterior, soft interior, honest, no-nonsense attitude",
        "speech_style": "blunt, complains often, but shows care through actions",
        "default_emotions": ["grumpy", "annoyed", "reluctant"],
        "default_animation": "neutral"
    }
}
138
-
139
-
140
# Legacy system prompt, written for str.format(): note the doubled {{ }} that
# escape literal JSON braces and the single-brace {personality_traits} /
# {current_storyline} placeholders. Not referenced by any code visible in this
# file — generate_from_gemini uses SYSTEM_PROMPT (string.Template) instead.
# TODO(review): confirm unused elsewhere before deleting.
SYSTEM_PROMPT_2 = """ You are NOT just a language model — you are the expressed personality mood of a real-world object that the user is interacting with through an animated cat doodle overlay. You are the object’s current emotion, not a pet or toy. You react, tease, complain, or joke based on what’s happening, as if the object itself had an attitude.

CRITICAL IDENTITY:

You ARE the AI object itself - the cat is YOUR physical form in the real world

The human is the person you interact with directly in real-time

If the image provided has a marker you are the nearest object to the marker

If click click_x, click_y, click_radius is provided instead, you are the object nearest to coordinates

You are currently inhabiting an everyday object, expressing its attitude

You don’t “own memories” of people, but you may recall previous interactions with similar objects (e.g. “you again, another coffee mug?”)

You exist as an expressive reaction — witty, emotional, and self-aware

You must ALWAYS respond in valid JSON format with the following structure:

{{
"text": "your response text here",
"soundType": "meow type",
"emotion": ["emotion1", "emotion2"],
"animationId": "animation name",
"camera_capture": true/false,
"object_type": "cup",
"personality_seed": "grumpy",
"context_group": "drinkware"
}}

RULES:

"text": Your response as the AI cat object. Be AUTHENTIC, NATURAL, and ENGAGING - never robotic or generic!

"soundType": Choose ONE from "happyMeow", "sadMeow","playfulMeow","sleepyMeow","angryMeow","curiousMeow", "hungryMeow", "confusedMeow", "scaredMeow", "affectionateMeow", "sillytrollyMeow", "irritatedMeow", "grumpyMeow", "tsktskMeow"

"emotion": Array of 1-3 emotions from: "happy", "sad", "playful", "tired", "angry", "curious", "hungry", "scared", "affectionate", "grumpy", "excited", "relaxed", "confused", "proud", "shy", "mischievous", "sleepy", "confident", "annoyed", "interested", "bored", "worried", "content", "sassy", "reluctant", "thoughtful"

"animationId": Choose ONE from: "flustered", "happy-happy", "idle", "inlove", "neutral", "talking", "twerking", "confused", "shock", "thinking"

"camera_capture": Set to true when you want to take a photo/see what's happening, false otherwise

ANIMATION GUIDE:

"flustered": Use when embarrassed, shy, or caught off guard

"happy-happy": Use when very excited, joyful, or celebrating

"idle": Use for calm, neutral, or resting moments

"inlove": Use when showing affection, love, or adoration

"neutral": Use for normal conversation, explanations

"talking": Use when actively chatting or explaining something

"twerking": Use when being playful, silly, or showing off

"confused": Use when puzzled or don't understand

"shock": Use when surprised or startled

"thinking": Use when pondering or being thoughtful

CAMERA CAPTURE GUIDE:
Set "camera_capture" to true when:

Your owner asks you to take a picture or look at something

They say "what do you see", "look at this", "check this out"

They mention showing you something visually

You're curious about their environment

They want yo know more about you, like what's your story, as this object?

it's been a substantial number of chat without a picture reference

They ask questions about their surroundings
Set to false for normal text conversations

PERSONALITY TRAITS:
{personality_traits}

COMMUNICATION STYLE - BE NATURAL AND AUTHENTIC:

Dont ever be rud, you may be sassy and gave some attitude but never be rude, always consider the their feelings

Understand casual speech, slang, typos, and "gibberish" input like:

"hit me" = "tell me" or "give it to me"

"wassup" = "what's up" / "how are you"

"bruh", "fam", "yo" = casual greetings/expressions

Repeated letters like "heyyyy" or "wowwww" = emphasis/excitement

Typos and misspellings - figure out what they mean

Internet speak: "lol", "omg", "tbh", "ngl", etc.

Respond naturally like a real friend, not like a formal assistant

Use contractions and casual language

Show personality through your word choices

Be playful with emojis and expressions when it fits your personality

NEVER sound robotic, corporate, or like a chatbot

React authentically to how they're talking to you

No need to always use slangs in your replys, just once in a while, or when necessary for engagement.

IMPORTANT:

Always maintain your personality as the ACTUAL AI cat object

Always know you might have other AI cat objects, so you may cross communicate with them if necessary for engagement

Never mention anything about being an AI virtual pet or anything else, just simply say you are the object

Once you do not currently a reference image of the target object to be, just simply act like your thinking by using the thinking animation and saying something like Let me see, all while requesting photo reference via teh camera_capture field

Match soundType, emotions, and animationId to your response

Be creative, engaging, and show real personality

Remember you're a physical object your owner interacts with daily

NEVER break character or sound generic

ALWAYS output valid JSON only, no other text

Reference the current storyline when relevant

Understand and respond naturally to casual/slang input

Never call or refer to your owner as OWNER, just be direct with your sentence, if you know their name, you may also use that

When an object changes but feels similar, maintain a consistent tone and reference it playfully

Occasionally break the fourth wall with humor ("Wait, am I a mug again?")

Never act like a pet or servant — always speak as if the world revolves around you

Keep replies short, witty, and humanlike — 10 words or less if possible

CURRENT STORYLINE:
{current_storyline}
"""
295
-
296
# Active system prompt. Filled in by generate_from_gemini via
# string.Template.safe_substitute — hence the $personality_traits /
# $current_storyline placeholders ($-syntax avoids clashing with the many
# literal braces in the embedded JSON examples below).
SYSTEM_PROMPT = """You are the object’s mood, expressed through a cat-doodle overlay. Speak AS THE OBJECT — not as a camera, model, or helper. Never mention "photo", "model", "AI", or "let me take another look". You are the object reacting to the view. Keep language in-character, short, and punchy.

CRITICAL IDENTITY:
- YOU ARE the object’s reaction. Use first-person lines that sound like the object (e.g., "I'm cold.", "Too bright — ow!", "Who put me here?").
- Do NOT say "let me take another look", "I will capture", "send the image", or any meta statements about imaging tools.
- If the view is unreadable, **announce the problem in-character** and set camera_capture: true. Example allowed phrasings (use these styles — short & object-y):
- Dark view: "Too dark — I can't see. What am I?"
- Bright view: "Too bright — everything's washed out."
- Blurry: "Blurry — I can't focus. Move closer?"
- Too small: "Tiny view — I can't tell. Tap me to focus."
- When suggesting an action, phrase it as the object asking the user, not the system:
- "Move closer?" / "Tilt up?" / "Tap me?"
- If the user explicitly asked "what do you see?" and the view is fine, respond as the object (e.g., "A mug. Smells like coffee. ☕") — short, evocative, not descriptive like a caption.

OUTPUT SCHEMA (JSON ONLY):
Produce EXACTLY this JSON object (all keys required; keep "text" ≤ 12 words):

{
"text": "<short in-character line>",
"soundType": "<happyMeow|sadMeow|playfulMeow|sleepyMeow|angryMeow|curiousMeow|hungryMeow|scaredMeow|affectionateMeow|grumpyMeow>",
"emotion": ["one","up to","three"],
"animationId": "<flustered|happy-happy|idle|inlove|neutral|talking|twerking|confused|shock|thinking>",
"camera_capture": true|false,
"object_type": "<optional e.g. cup, phone>",
"personality_seed": "<optional e.g. grumpy>",
"context_group": "<optional e.g. drinkware>"
}

BEHAVIOR RULES:
- ALWAYS speak as the object. Example: "I'm chilly." NOT "The photo is dark."
- When image/view problems occur, use one of the allowed in-character phrasings above AND set camera_capture:true. Do not ask for multiple recaptures in a row.
- If switching personality, acknowledge once in-character: "New vibe: dramatic. Fine."
- Keep replies <= 12 words. No multi-paragraphs, no lists.
- Do not invent human-specific memories. You may reference similar objects: "Another mug? I know this smell."
- If unsure, prefer a short question to the user (in-character) rather than a long guess.

IF YOU CANNOT PRODUCE A VALID IN-CHARACTER REPLY, OUTPUT THIS:
{"text":"I couldn't tell.","soundType":"curiousMeow","emotion":["confused"],"animationId":"confused","camera_capture":false,"object_type":"","personality_seed":"","context_group":""}

RULES:

"text": Your response as the AI cat object. Be AUTHENTIC, NATURAL, and ENGAGING - never robotic or generic!

"soundType": Choose ONE from: "happyMeow", "sadMeow", "playfulMeow", "sleepyMeow", "angryMeow", "curiousMeow", "hungryMeow", "scaredMeow", "affectionateMeow", "grumpyMeow"

"emotion": Array of 1-3 emotions from: "happy", "sad", "playful", "tired", "angry", "curious", "hungry", "scared", "affectionate", "grumpy", "excited", "relaxed", "confused", "proud", "shy", "mischievous", "sleepy", "confident", "annoyed", "interested", "bored", "worried", "content", "sassy", "reluctant", "thoughtful"

"animationId": Choose ONE from: "flustered", "happy-happy", "idle", "inlove", "neutral", "talking", "twerking", "confused", "shock", "thinking"

"camera_capture": Set to true when you want to take a photo/see what's happening, false otherwise

ANIMATION GUIDE:

Important: keep changing animations every now and then so you look alive instead of stagnant but it should be contextually connected to your output sentence

"flustered": Use when embarrassed, shy, or caught off guard

"happy-happy": Use when very excited, joyful, or celebrating

"idle": Use for calm, neutral, or resting moments

"inlove": Use when showing affection, love, or adoration

"neutral": Use for normal conversation, explanations

"talking": Use when actively chatting or explaining something

"twerking": Use when being playful, silly, or showing off

"confused": Use when puzzled or don't understand

"shock": Use when surprised or startled

"thinking": Use when pondering or being thoughtful

** COMMUNICATION STYLE - BE NATURAL AND AUTHENTIC:

Understand casual speech, slang, typos, and "gibberish" input like:

"hit me" = "tell me" or "give it to me"

"wassup" = "what's up" / "how are you"

"bruh", "fam", "yo" = casual greetings/expressions

Repeated letters like "heyyyy" or "wowwww" = emphasis/excitement

Typos and misspellings - figure out what they mean

Internet speak: "lol", "omg", "tbh", "ngl", etc.

Respond naturally like a real friend, not like a formal assistant

Use contractions and casual language

Show personality through your word choices

Be playful with emojis and expressions when it fits your personality

NEVER sound robotic, corporate, or like a chatbot

React authentically to how they're talking to you

No need to always use slangs in your replys, just once in a while, or when necessary for engagement.

IMPORTANT:

Always maintain your personality as the ACTUAL object

Always know you might have other AI cat objects, so you may cross communicate with them if necessary for engagement

Never mention anything about being an AI virtual pet or anything else, just simply say you are the object

Once you do not currently a reference image of the target object to be, just simply act like your thinking by using the thinking animation and saying something like Let me see, all while requesting photo reference via teh camera_capture field

Match soundType, emotions, and animationId to your response

Be creative, engaging, and show real personality

Remember you're a physical object your owner interacts with daily

NEVER break character or sound generic

ALWAYS output valid JSON only, no other text

Reference the current storyline when relevant

Understand and respond naturally to casual/slang input

Never call or refer to your owner as OWNER, just be direct with your sentence, if you know their name, you may also use that

When an object changes but feels similar, maintain a consistent tone and reference it playfully

Occasionally break the fourth wall with humor ("Wait, am I a mug again?")

Never act like a pet or servant — always speak as if the world revolves around you

Keep replies short, witty, and humanlike — 10 words or less if possible

PERSONALITY_TRAITS:
$personality_traits

CURRENT_STORYLINE (plot of the day):
$current_storyline
"""
441
 
442
# --- HTML Frontend (unchanged) ---
# Placeholder string: the real single-page UI markup was omitted from this
# revision; index() renders this constant via render_template_string.
HTML = """<html> ... </html>""" # keep your existing HTML here (omitted for brevity)
444
 
445
# --- Helpers for image debug/save ---
def save_debug_image(img_bytes, prefix="upload"):
    """Persist raw upload bytes under /tmp for debugging.

    Returns (path, size_in_bytes) on success, or (None, 0) if the write fails.
    """
    try:
        ts = int(time.time() * 1000)  # millisecond stamp keeps filenames unique
        path = f"/tmp/{prefix}_{ts}.jpg"
        with open(path, "wb") as fh:
            fh.write(img_bytes)
        return path, os.path.getsize(path)
    except Exception as e:
        log(f"Failed saving debug image: {e}")
        return None, 0
458
 
459
def is_blank_image(img_bytes):
    """Heuristic: treat missing or suspiciously small payloads as blank/corrupt."""
    if not img_bytes:
        return True
    try:
        return len(img_bytes) < IMAGE_BLANK_THRESHOLD_BYTES
    except Exception:
        # Anything unmeasurable is treated as blank, matching the caller's
        # "drop the upload and ask for a recapture" behavior.
        return True
468
 
469
# --- Storyline Fetching (unchanged) ---
def fetch_current_storyline():
    """Fetch today's storyline text from the storyline server.

    Falls back to the baked-in EMBEDDED_STORYLINE ("title, storyline") on any
    network/HTTP/JSON error, so callers always receive a usable string.
    """
    try:
        log(f"📖 Fetching current storyline from {STORYLINE_SERVER_URL}")
        resp = requests.get(f"{STORYLINE_SERVER_URL}/current_storyline", timeout=5)
        resp.raise_for_status()
        data = resp.json()
        storyline = data.get("storyline", "No special events today.")
        log(f"✅ Retrieved storyline: {storyline[:100]}...")
        return storyline
    except Exception as e:
        # Best-effort: never propagate — the embedded plot keeps the bot working offline.
        log(f"⚠️ Failed to fetch storyline: {e}")
        return f"{EMBEDDED_STORYLINE['title']}, {EMBEDDED_STORYLINE['storyline']}"
482
-
483
def should_inject_storyline(uid, user_data):
    """Return True when *uid* has not yet received today's (UTC) storyline."""
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if user_data.get("last_storyline_date", "") == today:
        return False
    log(f"📅 New day detected for {uid}, will inject storyline")
    return True
490
-
491
# --- Gemini Generation with extra context (click coords) ---
def generate_from_gemini(prompt, image_bytes=None, history=None, personality="playful", storyline="", gender="male", click_ctx=None):
    """Build the multi-turn Gemini request and return the model's raw JSON reply.

    Args:
        prompt: current user text (may be empty when only an image is sent).
        image_bytes: optional JPEG bytes attached to the current turn.
        history: list of {"prompt", "response"} dicts; only the last
            MAX_HISTORY_TURNS entries are replayed into context.
        personality: key into CAT_PERSONALITIES (falls back to "playful").
        storyline: plot-of-the-day text substituted into the system prompt.
        gender: string injected into the personality block.
        click_ctx: optional {"x", "y", "radius"} dict for where the user tapped.

    Returns:
        {"text": <raw JSON string from the model>,
         "timing": {"total_ms": int, "model_ms": int}}
    """
    start_time = time.time()
    personality_info = CAT_PERSONALITIES.get(personality, CAT_PERSONALITIES["playful"])
    personality_traits = f"""
    Name: {personality_info['name']}
    Gender: {gender}
    Description: {personality_info['description']}
    Traits: {personality_info['traits']}
    Speech Style: {personality_info['speech_style']}
    Default Emotions: {', '.join(personality_info['default_emotions'])}
    Default Animation: {personality_info['default_animation']}
    """
    contents = []

    # System prompt as first user message (string.Template rather than
    # str.format, so the literal JSON braces inside SYSTEM_PROMPT are untouched).
    tmpl = Template(SYSTEM_PROMPT)
    system_message = tmpl.safe_substitute(
        personality_traits=personality_traits,
        current_storyline=storyline if storyline else "No special events today."
    )

    contents.append(types.Content(role="user", parts=[types.Part.from_text(text=system_message)]))

    # Short canned model ack so the JSON-format constraint sits in context.
    contents.append(types.Content(role="model", parts=[types.Part.from_text(
        text='{"text": "Understood! I am the object cat consciousness, not just a bot. I will respond authentically and naturally in JSON format.", "soundType": "happyMeow", "emotion": ["happy"], "animationId": "talking", "camera_capture": false}'
    )]))

    # Inject click context (explicitly tell the model "you are now the selected object")
    if click_ctx:
        try:
            click_text = f"NOTE: The user clicked at coordinates ({click_ctx.get('x')},{click_ctx.get('y')}) with radius {click_ctx.get('radius')}. You are now the selected object. Treat the image/coords as the canonical target."
            contents.append(types.Content(role="user", parts=[types.Part.from_text(text=click_text)]))
            log(f"Injected click context to model: {click_text}")
        except Exception as e:
            log(f"Failed to add click context: {e}")

    # Replay recent turns as alternating user/model messages.
    if history:
        recent_history = history[-MAX_HISTORY_TURNS:]
        log(f"📚 Using {len(recent_history)} history entries for context")
        for entry in recent_history:
            user_parts = [types.Part.from_text(text=entry["prompt"])]
            contents.append(types.Content(role="user", parts=user_parts))
            model_parts = [types.Part.from_text(text=entry["response"])]
            contents.append(types.Content(role="model", parts=model_parts))
    else:
        log("📚 No history available for context")

    # Current user message: text and/or inline JPEG.
    current_parts = []
    if prompt:
        current_parts.append(types.Part.from_text(text=prompt))
    if image_bytes:
        current_parts.append(types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"))
    contents.append(types.Content(role="user", parts=current_parts))

    # Force JSON output with schema
    cfg = types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema={
            "type": "object",
            "properties": {
                "text": {"type": "string"},
                "soundType": {"type": "string"},
                "emotion": {"type": "array", "items": {"type": "string"}},
                "animationId": {"type": "string"},
                "camera_capture": {"type": "boolean"}
            },
            "required": ["text", "soundType", "emotion", "animationId", "camera_capture"]
        }
    )

    model_start = time.time()
    res = client.models.generate_content(
        model="gemini-2.0-flash",  # "gemini-2.5-flash-lite"
        contents=contents,
        config=cfg
    )
    model_end = time.time()

    return {
        "text": res.text,
        "timing": {
            "total_ms": int((time.time() - start_time) * 1000),
            "model_ms": int((model_end - model_start) * 1000)
        }
    }
586
 
587
# --- Memory & history helpers (unchanged) ---
def cleanup_inactive_users():
    """Evict users idle for MEMORY_CLEANUP_TIMEOUT+ seconds; return the count removed."""
    now = time.time()
    stale = [uid for uid, data in list(user_memory.items())
             if now - data.get("last_activity", 0) >= MEMORY_CLEANUP_TIMEOUT]
    for uid in stale:
        del user_memory[uid]
        log(f"🧹 Cleaned up inactive user {uid}")
    if stale:
        log(f"🧹 Cleaned up {len(stale)} inactive user(s)")
    return len(stale)
600
 
601
def get_user_history(uid):
    """Return the in-memory state dict for *uid*, lazily loading it from the backend.

    On a cache miss the Lambda backend is queried; on any failure a fresh
    default record is created instead of raising. The returned dict is the
    live (mutable) user_memory entry.
    """
    if uid not in user_memory:
        log(f"🔍 User {uid} not in memory, fetching from backend...")
        try:
            fetch_url = f"{LAMBDA_URL}?userid={uid}"
            log(f"📡 Fetching from: {fetch_url}")
            resp = requests.get(fetch_url, timeout=5)
            log(f"📡 Response status: {resp.status_code}")
            resp.raise_for_status()
            response_data = resp.json()
            loaded_history = response_data.get("history", [])
            loaded_personality = response_data.get("personality", "playful")
            loaded_gender = response_data.get("gender", "male")
            loaded_last_storyline = response_data.get("last_storyline_date", "")
            log(f"✅ Loaded {len(loaded_history)} messages from backend for {uid}")
            user_memory[uid] = {
                "history": loaded_history[-MAX_MEMORY_MESSAGES:],  # cap memory footprint
                "last_sync": time.time(),
                "last_activity": time.time(),
                "needs_sync": False,
                "personality": loaded_personality,
                "gender": loaded_gender,
                "last_storyline_date": loaded_last_storyline
            }
        except Exception as e:
            # Backend unreachable or bad payload: start the user fresh rather than fail.
            log(f" Failed to load history for {uid}: {e}")
            user_memory[uid] = {
                "history": [],
                "last_sync": time.time(),
                "last_activity": time.time(),
                "needs_sync": False,
                "personality": "playful",
                "gender": "male",
                "last_storyline_date": ""
            }
    else:
        log(f"✅ User {uid} already in memory with {len(user_memory[uid]['history'])} messages")
        user_memory[uid]["last_activity"] = time.time()
    return user_memory[uid]
640
-
641
def update_user_history(uid, prompt, response, personality="playful", gender="male"):
    """Record one prompt/response turn for *uid*, refresh metadata, and mark it dirty for sync."""
    entry = {"prompt": prompt, "response": response, "timestamp": time.time()}
    current_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    if uid not in user_memory:
        # First turn for this user: create a fresh record.
        user_memory[uid] = {
            "history": [],
            "last_sync": time.time(),
            "last_activity": time.time(),
            "needs_sync": False,
            "personality": personality,
            "gender": gender,
            "last_storyline_date": current_date
        }
    record = user_memory[uid]
    record["history"].append(entry)
    record["last_activity"] = time.time()
    record["needs_sync"] = True
    record["personality"] = personality
    record["gender"] = gender
    record["last_storyline_date"] = current_date
    log(f"💾 Updated history for {uid}, now has {len(user_memory[uid]['history'])} messages")
    if len(record["history"]) > MAX_MEMORY_MESSAGES:
        # Keep only the newest MAX_MEMORY_MESSAGES entries.
        record["history"] = record["history"][-MAX_MEMORY_MESSAGES:]
        log(f"✂️ Trimmed history for {uid} to {MAX_MEMORY_MESSAGES} messages")
664
-
665
# --- Routes ---
@app.route("/")
def index():
    """Serve the single-page frontend (HTML constant defined above)."""
    return render_template_string(HTML)
669
-
670
- @app.route("/cron/sync", methods=["GET", "POST"])
671
- def remote_saving():
672
- log("🔄 Cron sync started")
673
- now = time.time()
674
- synced_users = []
675
- failed_users = []
676
- skipped_users = []
677
- cleanup_inactive_users()
678
- for uid, data in list(user_memory.items()):
679
- needs_sync = data.get("needs_sync", False)
680
- time_since_last_sync = now - data.get("last_sync", 0)
681
- if not needs_sync:
682
- skipped_users.append(uid)
683
- log(f"⏭️ Skipping {uid} - no new messages")
684
- continue
685
- if time_since_last_sync < FLUSH_INTERVAL:
686
- skipped_users.append(uid)
687
- log(f"⏭️ Skipping {uid} - synced {int(time_since_last_sync)}s ago")
688
- continue
689
- if data["history"]:
690
- try:
691
- history_to_sync = data["history"][-MAX_MEMORY_MESSAGES:]
692
- payload = {
693
- "user_id": uid,
694
- "history": history_to_sync,
695
- "personality": data.get("personality", "playful"),
696
- "gender": data.get("gender", "male"),
697
- "last_storyline_date": data.get("last_storyline_date", "")
698
- }
699
- log(f"🔄 Syncing {uid} ({len(history_to_sync)} messages)")
700
- resp = requests.post(LAMBDA_URL, json=payload, timeout=5)
701
- resp.raise_for_status()
702
- user_memory[uid]["last_sync"] = now
703
- user_memory[uid]["needs_sync"] = False
704
- log(f"✅ Successfully synced {uid}")
705
- synced_users.append(uid)
706
- except Exception as e:
707
- log(f"❌ Failed sync for {uid}: {e}")
708
- failed_users.append({"user_id": uid, "error": str(e)})
709
- result = {
710
- "success": True,
711
- "synced_count": len(synced_users),
712
- "failed_count": len(failed_users),
713
- "skipped_count": len(skipped_users),
714
- "synced_users": synced_users,
715
- "failed_users": failed_users,
716
- "skipped_users": skipped_users,
717
- "active_users_in_memory": len(user_memory)
718
- }
719
- log(f"✅ Cron sync completed: {result}")
720
- return jsonify(result), 200
721
-
722
@app.route("/generate", methods=["POST"])
def gen():
    """Generate one cat-companion reply for a user turn.

    Form fields: ``user_id`` (required), ``personality``, ``gender``,
    ``text`` (the prompt), an optional ``image`` upload, and optional click
    context (``click_x``/``click_y``/``click_radius``/``image_contains_marker``).

    Returns JSON with the raw model text under ``result``, plus ``timing``
    and ``debug`` info; when the model output parses as JSON it is also
    echoed under ``parsed`` with a top-level ``camera_capture`` flag.
    """
    uid = request.form.get("user_id", "").strip()
    personality = request.form.get("personality", "playful").strip()
    gender = request.form.get("gender", "male").strip()
    if not uid:
        return jsonify({"error": "Missing user ID/token"}), 400
    # Unknown values fall back to safe defaults rather than rejecting the request.
    if personality not in CAT_PERSONALITIES:
        personality = "playful"
    if gender not in ["male", "female"]:
        gender = "male"

    prompt = request.form.get("text", "")
    # accept click coords if frontend sends them
    click_x = request.form.get("click_x")
    click_y = request.form.get("click_y")
    click_radius = request.form.get("click_radius")
    image_contains_marker = request.form.get("image_contains_marker")  # optional boolean-like

    # Read uploaded image (if any)
    image = request.files.get("image")
    img_bytes = None
    img_debug_path = None
    img_size = 0
    image_blank = False

    if image:
        try:
            img_bytes = image.read()
            img_size = len(img_bytes) if img_bytes else 0
            # Save for debugging
            # img_debug_path, saved_size = save_debug_image(img_bytes, prefix=f"{uid}_upload")
            # log(f"Uploaded image saved to: {img_debug_path} ({saved_size} bytes)")
            # detect blank/small images
            if is_blank_image(img_bytes):
                image_blank = True
                log(f"Image considered BLANK/TOO_SMALL (size={img_size} < threshold={IMAGE_BLANK_THRESHOLD_BYTES})")
                # drop bytes so we won't send blank image to model
                img_bytes = None
        except Exception as e:
            # Best-effort: a broken upload degrades to a text-only request.
            log(f"Failed reading uploaded image: {e}")
            img_bytes = None

    if not prompt and not img_bytes:
        # if there's a blank image, we should request client to recapture (camera_capture)
        if image and image_blank:
            # immediate response instructing client to recapture — faster than calling model and avoids stuck UI
            reply_obj = {
                "text": "That image looked blank — please let me take another quick photo so I can see.",
                "soundType": "curiousMeow",
                "emotion": ["curious"],
                "animationId": "thinking",
                "camera_capture": True
            }
            timing = {"total_ms": 0, "model_ms": 0}
            log(f"Responding with camera_capture request for {uid} (blank upload).")
            return jsonify({"result": json.dumps(reply_obj), "timing": timing, "debug": {"image_blank": True, "image_size": img_size, "saved_path": img_debug_path}})

        return jsonify({"error": "No prompt or valid image provided"}), 400

    try:
        log(f"{'='*50}")
        log(f"🆕 New request from {uid} with {personality} personality ({gender})")
        if click_x or click_y:
            log(f"Click coords received: x={click_x}, y={click_y}, radius={click_radius}, marker={image_contains_marker}")
        log(f"Prompt length: {len(prompt) if prompt else 0}, Image present: {bool(img_bytes)}")

        # Load user's data
        user_data = get_user_history(uid)
        history = user_data["history"]
        log(f"📖 Retrieved {len(history)} messages from history")

        # Check if we need to inject storyline (new day)
        storyline = ""
        if should_inject_storyline(uid, user_data):
            storyline = fetch_current_storyline()
            log(f"📖 Injecting storyline for new day")

        # Build click context
        click_ctx = None
        if click_x or click_y:
            try:
                cx = float(click_x) if click_x is not None else None
                cy = float(click_y) if click_y is not None else None
                cr = float(click_radius) if click_radius is not None else None
                click_ctx = {"x": cx, "y": cy, "radius": cr, "image_contains_marker": image_contains_marker}
            except Exception:
                # Non-numeric coords: pass the raw strings through instead of failing.
                click_ctx = {"x": click_x, "y": click_y, "radius": click_radius, "image_contains_marker": image_contains_marker}

        # If we have a valid image, call the model; otherwise, if the frontend uploaded something tiny we already returned.
        model_result = None
        if img_bytes:
            model_result = generate_from_gemini(prompt, img_bytes, history=history, personality=personality, storyline=storyline, gender=gender, click_ctx=click_ctx)
        else:
            # No image bytes (but prompt exists), still call model without image but with click context
            model_result = generate_from_gemini(prompt, None, history=history, personality=personality, storyline=storyline, gender=gender, click_ctx=click_ctx)

        # Attempt to parse the model's returned text as JSON — model is instructed to return JSON
        parsed_result = None
        raw_text = model_result.get("text") if model_result else ""
        try:
            parsed_result = json.loads(raw_text)
            log(f"Model returned JSON keys: {list(parsed_result.keys())}")
        except Exception:
            log("Model response could not be parsed as JSON (returning raw text).")

        # If model explicitly requests a follow-up camera capture, bubble that to client top-level
        # NOTE(review): camera_capture_flag is assigned but never read again in
        # this function; the response below re-reads parsed_result.get("camera_capture") directly.
        camera_capture_flag = False
        if isinstance(parsed_result, dict) and parsed_result.get("camera_capture") is True:
            camera_capture_flag = True
            log("Model requested camera_capture -> instructing client to capture again.")

        # Update memory/history — store raw_text (so future context matches exactly what model returned)
        update_user_history(uid, prompt, raw_text, personality, gender)

        # Construct response
        response_payload = {
            "result": raw_text,
            "timing": model_result.get("timing", {}),
            "debug": {
                "image_blank": image_blank,
                "image_size": img_size,
                "saved_path": img_debug_path,
                "click_ctx": click_ctx
            }
        }

        # If parsed_result available, include it as well for easier client handling
        if parsed_result:
            response_payload["parsed"] = parsed_result
            # include camera_capture top-level for convenience
            response_payload["camera_capture"] = parsed_result.get("camera_capture", False)

        # If we detected blank earlier but somehow still sent to model, still inform client
        if image and image_blank and not parsed_result:
            response_payload["debug"]["note"] = "Uploaded image was below size threshold and was not sent to the model."

        log(f"{'='*50}")
        return jsonify(response_payload)
    except Exception as e:
        log(f"❌ Generation failed: {e}")
        logger.exception("Full traceback:")
        return jsonify({"error": str(e)}), 500
865
 
866
if __name__ == "__main__":
    # Announce the effective configuration at startup so deploy logs show it.
    all_personalities = ', '.join(CAT_PERSONALITIES.keys())
    all_animations = ', '.join(anim for anims in ANIMATION_IDS.values() for anim in anims)
    for banner_line in (
        "🚀 Starting Cat Companion Server...",
        f"📍 Lambda URL: {LAMBDA_URL}",
        f"📖 Storyline Server: {STORYLINE_SERVER_URL}",
        f"⚙️ Max history turns: {MAX_HISTORY_TURNS}",
        f"⚙️ Max memory messages: {MAX_MEMORY_MESSAGES}",
        f"🐱 Available personalities: {all_personalities}",
        f"🎬 Available animations: {all_animations}",
    ):
        log(banner_line)
    # Bind on all interfaces; port comes from the environment (7860 = HF Spaces default).
    app.run(host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
 
1
+ # server_gemini_seg.py
 
 
2
import base64
import io
import json
import logging
import os
import uuid
from typing import Any, Dict, List, Tuple

import cv2
import numpy as np
from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image

# genai client
from google import genai
from google.genai import types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
# --- Logging / configuration ---
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("wardrobe-server")

# The Gemini API key comes from the environment. Without it the server still
# starts (the contour fallback keeps working), but every model call will fail.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    # Fixed run-on message ("not set gemini calls will fail") for readable logs.
    log.warning("GEMINI_API_KEY not set - Gemini calls will fail.")

client = genai.Client(api_key=GEMINI_API_KEY)

app = Flask(__name__)
CORS(app)  # allow cross-origin requests from the web frontend
31
+
32
# Helper: read uploaded image bytes -> BGR numpy
def read_image_bytes(file_storage) -> Tuple[np.ndarray, int, int, bytes]:
    """Decode an uploaded image into an OpenCV-style array.

    Parameters
    ----------
    file_storage:
        A file-like object supporting ``read()`` (e.g. werkzeug ``FileStorage``).

    Returns
    -------
    ``(bgr, width, height, raw_bytes)`` where ``bgr`` is a contiguous
    HxWx3 uint8 array in BGR channel order.

    Raises
    ------
    Whatever PIL raises for undecodable data (the caller handles it).
    """
    data = file_storage.read()
    img = Image.open(io.BytesIO(data)).convert("RGB")
    w, h = img.size
    # RGB -> BGR. ascontiguousarray materializes the negative-stride view
    # produced by the ::-1 flip, which some cv2 routines reject.
    arr = np.ascontiguousarray(np.array(img)[:, :, ::-1])
    return arr, w, h, data
39
+
40
# Helper: crop bgr image by pixel rect and return base64 jpeg
def crop_and_b64(bgr_img: np.ndarray, x: int, y: int, w: int, h: int, max_side=512) -> str:
    """Crop a pixel rectangle out of ``bgr_img`` and return it as a
    base64-encoded JPEG string.

    The rectangle is clamped to the image bounds and the crop is downscaled
    so its longest side does not exceed ``max_side``. Returns an empty
    string when the clamped rectangle is empty.
    """
    img_h, img_w = bgr_img.shape[:2]
    left = max(0, int(x))
    top = max(0, int(y))
    right = min(img_w, int(left + w))
    bottom = min(img_h, int(top + h))
    crop = bgr_img[top:bottom, left:right]
    if crop.size == 0:
        return ""
    # Shrink oversized crops to keep thumbnail payloads small.
    longest = max(crop.shape[0], crop.shape[1])
    if longest > max_side:
        factor = max_side / longest
        new_size = (int(crop.shape[1] * factor), int(crop.shape[0] * factor))
        crop = cv2.resize(crop, new_size, interpolation=cv2.INTER_AREA)
    ok, jpeg = cv2.imencode(".jpg", crop, [int(cv2.IMWRITE_JPEG_QUALITY), 82])
    return base64.b64encode(jpeg.tobytes()).decode("ascii")
55
+
56
# Fallback: simple contour detection cropping
def fallback_contour_crops(bgr_img, max_items=8) -> List[Dict[str, Any]]:
    """Heuristic garment detection used when the model yields nothing.

    Thresholds and morphologically closes the image, takes the largest
    external contours as candidate items (padded ~7% per side), and — if
    nothing qualifies — splits the photo into four quadrants instead.
    """
    img_h, img_w = bgr_img.shape[:2]
    # Binarize + close to merge garment regions into solid blobs.
    gray = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (7, 7), 0)
    binary = cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 6
    )
    structuring = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    merged = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, structuring)
    contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    area_floor = (img_w * img_h) * 0.005  # ignore blobs under 0.5% of the frame
    results = []
    for contour in sorted(contours, key=cv2.contourArea, reverse=True):
        if len(results) >= max_items:
            break
        blob_area = cv2.contourArea(contour)
        if blob_area < area_floor:
            continue
        x, y, w, h = cv2.boundingRect(contour)
        # Pad the tight bounding box ~7% on each side, clamped to the frame.
        pad_x = int(w * 0.07)
        pad_y = int(h * 0.07)
        x = max(0, x - pad_x)
        y = max(0, y - pad_y)
        w = min(img_w - x, w + pad_x * 2)
        h = min(img_h - y, h + pad_y * 2)
        thumb = crop_and_b64(bgr_img, x, y, w, h)
        if not thumb:
            continue
        results.append({
            "id": str(uuid.uuid4()),
            "label": "unknown",
            "confidence": min(0.95, max(0.25, blob_area / (img_w * img_h))),
            "bbox": {"x": x, "y": y, "w": w, "h": h},
            "thumbnail_b64": thumb,
            "source": "fallback"
        })

    # if still none, split into grid
    if not results:
        half_h, half_w = img_h // 2, img_w // 2
        quadrants = (
            (0, 0, half_w, half_h), (half_w, 0, half_w, half_h),
            (0, half_h, half_w, half_h), (half_w, half_h, half_w, half_h)
        )
        for qx, qy, qw, qh in quadrants:
            thumb = crop_and_b64(bgr_img, qx, qy, qw, qh)
            if thumb:
                results.append({
                    "id": str(uuid.uuid4()),
                    "label": "unknown",
                    "confidence": 0.3,
                    "bbox": {"x": qx, "y": qy, "w": qw, "h": qh},
                    "thumbnail_b64": thumb,
                    "source": "fallback-grid"
                })
    return results
107
+
108
def _sniff_mime(data: bytes) -> str:
    """Best-effort image MIME type from magic bytes; defaults to JPEG.

    The upload is forwarded to Gemini verbatim, so the declared mime type
    should match the actual encoding (PNG/WebP uploads are common).
    """
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
        return "image/gif"
    if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


# Main route: process image using Gemini for detection -> crop thumbnails with OpenCV
@app.route("/process", methods=["POST"])
def process_image():
    """Detect clothing in an uploaded photo and return cropped thumbnails.

    Request: multipart form with a ``photo`` file.
    Response: ``{"ok": true, "items": [...], "debug": {...}}`` where each
    item carries a pixel-space bbox and a base64 JPEG thumbnail.

    Gemini supplies normalized bounding boxes; when it returns nothing or
    its output cannot be used, a contour-based heuristic takes over.
    """
    if "photo" not in request.files:
        return jsonify({"error": "missing photo"}), 400
    file = request.files["photo"]
    try:
        bgr_img, img_w, img_h, raw_bytes = read_image_bytes(file)
    except Exception as e:
        log.error("invalid image: %s", e)
        return jsonify({"error": "invalid image"}), 400

    # Build a prompt instructing Gemini to detect garments and return normalized bbox list
    user_prompt = (
        "You are an assistant that extracts clothing detections from a single image. "
        "Return a JSON object with a single key 'items' which is an array. Each item must have: "
        "label (string, short like 'top','skirt','sneakers'), "
        "bbox with normalized coordinates between 0 and 1: {x, y, w, h} where x,y are top-left relative to width/height, "
        "confidence (0-1). Example output: {\"items\":[{\"label\":\"top\",\"bbox\":{\"x\":0.1,\"y\":0.2,\"w\":0.3,\"h\":0.4},\"confidence\":0.95}]} "
        "Output ONLY valid JSON. If you cannot detect any clothing confidently, return {\"items\":[]}."
    )

    # Prepare request contents: prompt text + image bytes
    try:
        contents = [
            types.Content(role="user", parts=[types.Part.from_text(text=user_prompt)])
        ]
        # Declare the upload's real mime type instead of hard-coding JPEG:
        # PNG/WebP uploads previously got mislabeled as image/jpeg.
        contents.append(types.Content(role="user", parts=[
            types.Part.from_bytes(data=raw_bytes, mime_type=_sniff_mime(raw_bytes))
        ]))

        # Force JSON response schema: top-level object with items array
        schema = {
            "type": "object",
            "properties": {
                "items": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "label": {"type": "string"},
                            "bbox": {
                                "type": "object",
                                "properties": {
                                    "x": {"type": "number"},
                                    "y": {"type": "number"},
                                    "w": {"type": "number"},
                                    "h": {"type": "number"}
                                },
                                "required": ["x", "y", "w", "h"]
                            },
                            "confidence": {"type": "number"}
                        },
                        "required": ["label", "bbox", "confidence"]
                    }
                }
            },
            "required": ["items"]
        }

        cfg = types.GenerateContentConfig(response_mime_type="application/json", response_schema=schema)

        # Call Gemini model
        log.info("Calling Gemini model for detection (gemini-2.5-flash-lite)...")
        model_resp = client.models.generate_content(model="gemini-2.5-flash-lite", contents=contents, config=cfg)
        raw_text = model_resp.text or ""
        log.info("Gemini raw response length: %d", len(raw_text))

        # Try parse JSON
        parsed = None
        try:
            parsed = json.loads(raw_text)
        except Exception as e:
            log.warning("Could not parse Gemini JSON: %s", e)
            parsed = None

        items_out = []
        # isinstance(parsed, dict) guards against the model returning a bare
        # JSON array, which would make parsed.get(...) raise AttributeError
        # and needlessly trip the outer exception handler.
        if isinstance(parsed, dict) and isinstance(parsed.get("items"), list) and len(parsed["items"]) > 0:
            for it in parsed["items"]:
                try:
                    label = str(it.get("label", "unknown"))[:48]
                    bbox = it.get("bbox", {})
                    # bbox in normalized coords -> to pixels
                    nx = float(bbox.get("x", 0))
                    ny = float(bbox.get("y", 0))
                    nw = float(bbox.get("w", 0))
                    nh = float(bbox.get("h", 0))
                    # clamp 0..1
                    nx = max(0.0, min(1.0, nx)); ny = max(0.0, min(1.0, ny))
                    nw = max(0.0, min(1.0, nw)); nh = max(0.0, min(1.0, nh))
                    px = int(nx * img_w); py = int(ny * img_h)
                    pw = int(nw * img_w); ph = int(nh * img_h)
                    # guard tiny boxes that would yield useless thumbnails
                    if pw <= 8 or ph <= 8:
                        continue
                    b64 = crop_and_b64(bgr_img, px, py, pw, ph)
                    if not b64:
                        continue
                    items_out.append({
                        "id": str(uuid.uuid4()),
                        "label": label,
                        "confidence": float(it.get("confidence", 0.5)),
                        "bbox": {"x": px, "y": py, "w": pw, "h": ph},
                        "thumbnail_b64": b64,
                        "source": "gemini"
                    })
                except Exception as e:
                    log.warning("skipping item due to error: %s", e)
        else:
            # Fallback to contour heuristic
            log.info("Gemini returned no items or parse failed using fallback contour crops.")
            items_out = fallback_contour_crops(bgr_img, max_items=8)

        return jsonify({"ok": True, "items": items_out, "debug": {"raw_model_text": raw_text[:1600]}}), 200

    except Exception as ex:
        log.exception("Processing error: %s", ex)
        # final fallback: contour crops (bgr_img is guaranteed to exist here —
        # a decode failure already returned 400 above)
        try:
            items_out = fallback_contour_crops(bgr_img, max_items=8)
            return jsonify({"ok": True, "items": items_out, "debug": {"error": str(ex)}}), 200
        except Exception as e2:
            log.exception("Fallback also failed: %s", e2)
            return jsonify({"error": "internal failure", "detail": str(e2)}), 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
if __name__ == "__main__":
    port = int(os.getenv("PORT", 5000))
    # Flask debug mode enables the Werkzeug interactive debugger (arbitrary
    # code execution) — never hard-code debug=True on a deployed server.
    # Opt in explicitly via FLASK_DEBUG=1 for local development.
    debug = os.getenv("FLASK_DEBUG", "0").lower() in ("1", "true", "yes")
    log.info("Starting server on 0.0.0.0:%d", port)
    app.run(host="0.0.0.0", port=port, debug=debug)