Rox-Turbo committed on
Commit
7a3cf9a
·
verified ·
1 Parent(s): 369291f

Upload 21 files

Browse files
Files changed (4) hide show
  1. private/admin/admin.css +0 -0
  2. public/app.js +128 -2
  3. public/styles.css +87 -0
  4. server.js +156 -68
private/admin/admin.css CHANGED
The diff for this file is too large to render. See raw diff
 
public/app.js CHANGED
@@ -3866,6 +3866,11 @@ class RoxAI {
3866
  this._updateDeepResearchStatus(data.status);
3867
  continue;
3868
  }
 
 
 
 
 
3869
  // Handle DeepResearch started notification
3870
  if (data.deepResearchStarted && data.info) {
3871
  this._showDeepResearchNotice(data.info, data.searchCount, data.articlesRead);
@@ -4972,6 +4977,46 @@ class RoxAI {
4972
  formData.append('chatId', this.currentConversationId || '');
4973
  formData.append('conversationHistory', JSON.stringify(historyUpTo));
4974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4975
  this.requestController = new AbortController();
4976
 
4977
  // Track streaming response for partial save on cancel
@@ -8417,6 +8462,87 @@ class RoxAI {
8417
  }
8418
  }
8419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8420
  /**
8421
  * Update typing status with elapsed time and status messages
8422
  * @private
@@ -8425,8 +8551,8 @@ class RoxAI {
8425
  const statusEl = document.getElementById('typingStatus');
8426
  if (!statusEl || !this._typingStartTime) return;
8427
 
8428
- // Don't override DeepResearch status
8429
- if (statusEl.classList.contains('deep-research-status')) {
8430
  return;
8431
  }
8432
 
 
3866
  this._updateDeepResearchStatus(data.status);
3867
  continue;
3868
  }
3869
+ // Handle Vision processing real-time status updates
3870
+ if (data.visionStatus && data.status) {
3871
+ this._updateVisionStatus(data.status, data.phase, data.currentImage, data.totalImages, data.model);
3872
+ continue;
3873
+ }
3874
  // Handle DeepResearch started notification
3875
  if (data.deepResearchStarted && data.info) {
3876
  this._showDeepResearchNotice(data.info, data.searchCount, data.articlesRead);
 
4977
  formData.append('chatId', this.currentConversationId || '');
4978
  formData.append('conversationHistory', JSON.stringify(historyUpTo));
4979
 
4980
+ // ==================== REGENERATE WITH ORIGINAL IMAGES ====================
4981
+ // If the original message had image attachments, include them in regeneration
4982
+ // This ensures vision analysis works correctly on regenerate
4983
+ if (userMsg.attachments && userMsg.attachments.length > 0) {
4984
+ const imageAttachments = userMsg.attachments.filter(att =>
4985
+ att.type && att.type.startsWith('image/') && att.preview
4986
+ );
4987
+
4988
+ if (imageAttachments.length > 0) {
4989
+ console.log(`๐Ÿ–ผ๏ธ Regenerating with ${imageAttachments.length} original image(s)`);
4990
+
4991
+ // Convert base64 previews back to File objects for upload
4992
+ for (const att of imageAttachments) {
4993
+ try {
4994
+ // Extract base64 data from data URL
4995
+ const base64Match = att.preview.match(/^data:([^;]+);base64,(.+)$/);
4996
+ if (base64Match) {
4997
+ const mimeType = base64Match[1];
4998
+ const base64Data = base64Match[2];
4999
+
5000
+ // Convert base64 to Blob
5001
+ const byteCharacters = atob(base64Data);
5002
+ const byteNumbers = new Array(byteCharacters.length);
5003
+ for (let i = 0; i < byteCharacters.length; i++) {
5004
+ byteNumbers[i] = byteCharacters.charCodeAt(i);
5005
+ }
5006
+ const byteArray = new Uint8Array(byteNumbers);
5007
+ const blob = new Blob([byteArray], { type: mimeType });
5008
+
5009
+ // Create File from Blob
5010
+ const file = new File([blob], att.name, { type: mimeType });
5011
+ formData.append('files', file);
5012
+ }
5013
+ } catch (imgErr) {
5014
+ console.warn('Failed to restore image for regeneration:', att.name, imgErr);
5015
+ }
5016
+ }
5017
+ }
5018
+ }
5019
+
5020
  this.requestController = new AbortController();
5021
 
5022
  // Track streaming response for partial save on cancel
 
8462
  }
8463
  }
8464
 
8465
+ /**
8466
+ * Update Vision processing status in real-time
8467
+ * Shows premium status updates during image analysis pipeline
8468
+ * @param {string} status - Current status message
8469
+ * @param {string} phase - Current phase (analyzing, fallback, sending, generating, error)
8470
+ * @param {number|null} currentImage - Current image being processed (1-based)
8471
+ * @param {number|null} totalImages - Total number of images
8472
+ * @param {string} model - The main LLM model name
8473
+ * @private
8474
+ */
8475
+ _updateVisionStatus(status, phase, currentImage, totalImages, model) {
8476
+ const statusEl = document.getElementById('typingStatus');
8477
+ const typingIndicator = document.querySelector('.typing-indicator');
8478
+
8479
+ // Format display status based on phase
8480
+ let displayStatus = status;
8481
+ let shortStatus = status;
8482
+
8483
+ // Create premium phase-specific messages
8484
+ switch (phase) {
8485
+ case 'analyzing':
8486
+ if (totalImages === 1) {
8487
+ shortStatus = '๐Ÿ‘๏ธ Rox Vision analyzing your image...';
8488
+ displayStatus = '๐Ÿ‘๏ธ Rox Vision is carefully analyzing your image...';
8489
+ } else if (currentImage && totalImages) {
8490
+ shortStatus = `๐Ÿ‘๏ธ Analyzing image ${currentImage}/${totalImages}...`;
8491
+ displayStatus = `๐Ÿ‘๏ธ Rox Vision analyzing image ${currentImage} of ${totalImages}...`;
8492
+ }
8493
+ break;
8494
+ case 'analyzed':
8495
+ if (totalImages === 1) {
8496
+ shortStatus = 'โœ… Image analyzed';
8497
+ displayStatus = 'โœ… Image analysis complete';
8498
+ } else if (currentImage && totalImages) {
8499
+ shortStatus = `โœ… Image ${currentImage}/${totalImages} done`;
8500
+ displayStatus = `โœ… Image ${currentImage} of ${totalImages} analyzed`;
8501
+ }
8502
+ break;
8503
+ case 'fallback':
8504
+ shortStatus = '๐Ÿ”„ Switching to Rox Vision Max...';
8505
+ displayStatus = '๐Ÿ”„ Primary vision busy, Rox Vision Max taking over...';
8506
+ break;
8507
+ case 'sending':
8508
+ shortStatus = `๐Ÿ“ค Sending to ${model}...`;
8509
+ displayStatus = `๐Ÿ“ค Image context ready! Sending to ${model}...`;
8510
+ break;
8511
+ case 'generating':
8512
+ shortStatus = `๐Ÿง  ${model} is thinking...`;
8513
+ displayStatus = `๐Ÿง  ${model} is crafting your response...`;
8514
+ break;
8515
+ case 'error':
8516
+ shortStatus = 'โš ๏ธ Vision unavailable';
8517
+ displayStatus = status;
8518
+ break;
8519
+ }
8520
+
8521
+ // Update the status element
8522
+ if (statusEl) {
8523
+ statusEl.textContent = displayStatus;
8524
+ statusEl.classList.add('vision-status');
8525
+ statusEl.classList.remove('deep-research-status');
8526
+ }
8527
+
8528
+ // Update the typing indicator
8529
+ if (typingIndicator) {
8530
+ typingIndicator.classList.add('vision-active');
8531
+ typingIndicator.classList.remove('deep-research-active');
8532
+
8533
+ const textEl = typingIndicator.querySelector('.typing-text');
8534
+ if (textEl) {
8535
+ textEl.textContent = shortStatus;
8536
+ }
8537
+
8538
+ // Add progress indicator for multiple images
8539
+ if (totalImages > 1 && currentImage) {
8540
+ const progressPercent = Math.round((currentImage / totalImages) * 100);
8541
+ typingIndicator.style.setProperty('--vision-progress', `${progressPercent}%`);
8542
+ }
8543
+ }
8544
+ }
8545
+
8546
  /**
8547
  * Update typing status with elapsed time and status messages
8548
  * @private
 
8551
  const statusEl = document.getElementById('typingStatus');
8552
  if (!statusEl || !this._typingStartTime) return;
8553
 
8554
+ // Don't override DeepResearch or Vision status
8555
+ if (statusEl.classList.contains('deep-research-status') || statusEl.classList.contains('vision-status')) {
8556
  return;
8557
  }
8558
 
public/styles.css CHANGED
@@ -7510,6 +7510,93 @@ textarea:focus:not(:focus-visible) {
7510
  background: #10b981;
7511
  }
7512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7513
  /* ===== ATTACH MENU ===== */
7514
  .attach-menu {
7515
  position: fixed;
 
7510
  background: #10b981;
7511
  }
7512
 
7513
/* ===== VISION STATUS STYLES ===== */
/* Applied while the vision pipeline is reporting progress: `.vision-status`
   goes on the status text element and `.vision-active` on the typing
   indicator. */

/* Status text: violet, gently pulsing */
.vision-status {
  color: #8b5cf6 !important;
  font-weight: 500;
  animation: visionStatusPulse 2s ease-in-out infinite;
}

@keyframes visionStatusPulse {
  0%, 100% {
    opacity: 1;
  }
  50% {
    opacity: 0.7;
  }
}

/* Typing indicator container while vision is active */
.typing-indicator.vision-active {
  background: linear-gradient(135deg, rgba(139, 92, 246, 0.1) 0%, rgba(167, 139, 250, 0.15) 100%);
  border: 1px solid rgba(139, 92, 246, 0.3);
  position: relative;
  overflow: hidden;
}

/* Progress fill: its width follows the --vision-progress custom property
   and stays at 0% until that property is set. */
.typing-indicator.vision-active::before {
  content: '';
  position: absolute;
  top: 0;
  left: 0;
  height: 100%;
  width: var(--vision-progress, 0%);
  background: linear-gradient(90deg, rgba(139, 92, 246, 0.15) 0%, rgba(167, 139, 250, 0.2) 100%);
  transition: width 0.3s ease;
  z-index: 0;
}

/* Text and dots are lifted above the progress fill */
.typing-indicator.vision-active .typing-text {
  color: #8b5cf6;
  font-weight: 500;
  position: relative;
  z-index: 1;
}

.typing-indicator.vision-active .typing-dots span {
  background: #8b5cf6;
  position: relative;
  z-index: 1;
}

/* Subtle scale pulse on the dot group */
@keyframes visionEyePulse {
  0%, 100% {
    transform: scale(1);
  }
  50% {
    transform: scale(1.1);
  }
}

.typing-indicator.vision-active .typing-dots {
  animation: visionEyePulse 1.5s ease-in-out infinite;
}

/* Light theme: slightly darker violet for contrast */
[data-theme="light"] .vision-status {
  color: #7c3aed !important;
}

[data-theme="light"] .typing-indicator.vision-active {
  background: linear-gradient(135deg, rgba(124, 58, 237, 0.08) 0%, rgba(139, 92, 246, 0.12) 100%);
  border-color: rgba(124, 58, 237, 0.25);
}

[data-theme="light"] .typing-indicator.vision-active::before {
  background: linear-gradient(90deg, rgba(124, 58, 237, 0.1) 0%, rgba(139, 92, 246, 0.15) 100%);
}

[data-theme="light"] .typing-indicator.vision-active .typing-text {
  color: #7c3aed;
}

[data-theme="light"] .typing-indicator.vision-active .typing-dots span {
  background: #7c3aed;
}

/* Honour the user's reduced-motion preference: the looping pulse/scale
   effects are decorative, so switch them (and the width tween) off. */
@media (prefers-reduced-motion: reduce) {
  .vision-status,
  .typing-indicator.vision-active .typing-dots {
    animation: none;
  }

  .typing-indicator.vision-active::before {
    transition: none;
  }
}
7599
+
7600
  /* ===== ATTACH MENU ===== */
7601
  .attach-menu {
7602
  position: fixed;
server.js CHANGED
@@ -10293,95 +10293,144 @@ app.post('/api/chat', upload.array('files', 50), async (req, res) => {
10293
  return false;
10294
  };
10295
 
10296
- if (hasImages) {
10297
- log.info(`๐Ÿ–ผ๏ธ Step 1: Rox Vision analyzing image(s)...`);
10298
-
10299
- // Prepare vision analysis request
10300
- const visionMessages = [
10301
- { role: 'system', content: ROX_VISION_ANALYSIS_PROMPT },
10302
- {
10303
- role: 'user',
10304
- content: [
10305
- { type: 'text', text: `Please analyze the following image(s) thoroughly. The user's question is: "${message}"\n\nProvide a detailed analysis that will help answer their question.` },
10306
- ...imageContents
10307
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10308
  }
10309
- ];
10310
-
10311
  try {
10312
- // Call primary Rox Vision for image analysis (non-streaming for speed)
10313
- const visionResponse = await openai.chat.completions.create({
10314
- model: config.visionModel,
10315
- messages: visionMessages,
10316
- temperature: 0.3, // Lower temperature for more consistent analysis
10317
- top_p: 0.9,
10318
- max_tokens: 4096, // Enough for detailed analysis
10319
- stream: false
10320
- });
 
 
 
 
 
10321
 
10322
- visionAnalysis = visionResponse.choices?.[0]?.message?.content || null;
 
 
10323
 
10324
- // Check if vision model refused
10325
- if (visionAnalysis && isVisionRefusal(visionAnalysis)) {
10326
- log.warn(`โš ๏ธ Primary Rox Vision refused, trying Rox Vision Max...`);
10327
 
10328
- // Try fallback vision model (Rox Vision Max)
10329
- const fallbackVisionMessages = [
10330
- { role: 'system', content: ROX_VISION_MAX_ANALYSIS_PROMPT },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10331
  {
10332
  role: 'user',
10333
  content: [
10334
- { type: 'text', text: `Please analyze the following image(s) thoroughly. The user's question is: "${message}"\n\nProvide a detailed analysis.` },
10335
- ...imageContents
10336
  ]
10337
  }
10338
  ];
10339
 
10340
- const fallbackResponse = await openai.chat.completions.create({
10341
- model: config.fallbackVisionModel,
10342
- messages: fallbackVisionMessages,
10343
  temperature: 0.3,
10344
  top_p: 0.9,
10345
  max_tokens: 4096,
10346
  stream: false
10347
  });
10348
 
10349
- const fallbackAnalysis = fallbackResponse.choices?.[0]?.message?.content || null;
10350
 
10351
- // Check if fallback also refused
10352
- if (fallbackAnalysis && !isVisionRefusal(fallbackAnalysis)) {
10353
- visionAnalysis = fallbackAnalysis;
10354
- usingFallbackVision = true;
10355
- log.info(`๐Ÿ‘๏ธโœจ Rox Vision Max analysis complete`);
10356
- } else {
10357
- log.warn(`โš ๏ธ Rox Vision Max also refused or returned empty`);
10358
- visionAnalysis = null;
10359
  }
10360
- } else if (visionAnalysis) {
10361
- log.info(`๐Ÿ‘๏ธ Rox Vision analysis complete`);
10362
- } else {
10363
- log.warn(`โš ๏ธ Rox Vision returned empty response`);
 
 
 
10364
  }
10365
- } catch (visionError) {
10366
- log.error(`โŒ Vision analysis error: ${visionError.message || 'Unknown'}`);
10367
 
10368
- // Try fallback on error
10369
  try {
10370
- log.info(`๐Ÿ”„ Trying Rox Vision Max as fallback...`);
10371
- const fallbackVisionMessages = [
 
 
 
10372
  { role: 'system', content: ROX_VISION_MAX_ANALYSIS_PROMPT },
10373
  {
10374
  role: 'user',
10375
  content: [
10376
- { type: 'text', text: `Please analyze the following image(s). User question: "${message}"` },
10377
- ...imageContents
10378
  ]
10379
  }
10380
  ];
10381
 
10382
  const fallbackResponse = await openai.chat.completions.create({
10383
  model: config.fallbackVisionModel,
10384
- messages: fallbackVisionMessages,
10385
  temperature: 0.3,
10386
  top_p: 0.9,
10387
  max_tokens: 4096,
@@ -10390,20 +10439,53 @@ app.post('/api/chat', upload.array('files', 50), async (req, res) => {
10390
 
10391
  const fallbackAnalysis = fallbackResponse.choices?.[0]?.message?.content || null;
10392
 
10393
- // Check if fallback also refused
10394
  if (fallbackAnalysis && !isVisionRefusal(fallbackAnalysis)) {
10395
- visionAnalysis = fallbackAnalysis;
10396
- usingFallbackVision = true;
10397
- log.info(`๐Ÿ‘๏ธโœจ Rox Vision Max fallback analysis complete`);
10398
- } else {
10399
- log.warn(`โš ๏ธ Rox Vision Max also refused or returned empty`);
10400
- visionAnalysis = null;
10401
  }
 
 
 
 
10402
  } catch (fallbackError) {
10403
- log.error(`โŒ Fallback vision also failed: ${fallbackError.message || 'Unknown'}`);
10404
- // Set to null so the main LLM handles the request without vision analysis
10405
- visionAnalysis = null;
10406
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10407
  }
10408
  }
10409
 
@@ -10417,10 +10499,16 @@ app.post('/api/chat', upload.array('files', 50), async (req, res) => {
10417
  // Inject vision analysis into the user message for the main LLM
10418
  // The main LLM will use this analysis to generate the response
10419
  const visionSibling = usingFallbackVision ? 'Rox Vision Max' : 'Rox Vision';
 
 
 
 
 
 
10420
  const visionContext = `
10421
  ## ๐Ÿ‘๏ธ IMAGE ANALYSIS FROM YOUR VISION SIBLING (${visionSibling})
10422
 
10423
- Your vision sibling has analyzed the attached image(s) and provided the following information:
10424
 
10425
  ${visionAnalysis}
10426
 
@@ -10428,7 +10516,7 @@ ${visionAnalysis}
10428
 
10429
  **USER'S ORIGINAL QUESTION:** ${message}
10430
 
10431
- **YOUR TASK:** Using the image analysis above from your vision sibling, provide a helpful response to the user's question. You can reference the visual details your sibling identified. Remember, you're working as a team - your vision sibling sees the image, and you provide the intelligent response!
10432
  `;
10433
 
10434
  // Replace the last user message with the enhanced version (text only, no images)
 
10293
  return false;
10294
  };
10295
 
10296
// ==================== VISION STATUS HELPER ====================
// Pushes vision-pipeline progress events to the browser as SSE `data:` frames.
// Declared ahead of the vision processing code so that code can call it.
// True once this helper has either committed the SSE headers itself or seen
// that the response headers were already committed.
let visionSseHeadersSent = false;

/**
 * Send a vision status update to the frontend as one SSE event.
 *
 * On first use it switches the response into SSE mode (sets the streaming
 * headers and flushes them) unless the shared `sseHeadersSent` flag or
 * `res.headersSent` shows that headers are already out. Delivery is best
 * effort: failures are logged and never thrown to the caller.
 *
 * @param {string} status - Status message to display
 * @param {string} phase - Current phase (analyzing, analyzed, fallback, sending, generating, error)
 * @param {number} [currentImage] - Current image number (1-based)
 * @param {number} [totalImgs] - Total number of images
 */
const sendVisionStatus = (status, phase, currentImage = null, totalImgs = null) => {
  // Node reports a write after end() through an asynchronous 'error' event,
  // which the try/catch further down cannot intercept - so stop here instead.
  if (res.writableEnded || res.destroyed) {
    return;
  }

  if (!sseHeadersSent && !visionSseHeadersSent) {
    if (res.headersSent) {
      // Headers were committed elsewhere; setHeader would throw on every
      // call. Only the local flag is latched - the shared flag is left to
      // whoever owns it.
      visionSseHeadersSent = true;
    } else {
      try {
        res.setHeader('Content-Type', 'text/event-stream');
        res.setHeader('Cache-Control', 'no-cache, no-transform');
        res.setHeader('Connection', 'keep-alive');
        res.setHeader('X-Accel-Buffering', 'no');
        res.setHeader('Content-Encoding', 'none');
        res.setHeader('Transfer-Encoding', 'chunked');
        res.flushHeaders();
        visionSseHeadersSent = true;
        sseHeadersSent = true;
      } catch (headerErr) {
        log.warn(`Vision status: could not set SSE headers: ${headerErr.message || 'Unknown'}`);
      }
    }
  }

  try {
    const statusEvent = {
      visionStatus: true,
      status,
      phase,
      currentImage,
      totalImages: totalImgs,
      model: config.name,
    };
    res.write(`data: ${JSON.stringify(statusEvent)}\n\n`);
    // flush() only exists when a buffering middleware wraps the response.
    if (typeof res.flush === 'function') {
      res.flush();
    }
  } catch (writeErr) {
    // Best effort: a failed progress update must not abort the request.
    log.warn(`Vision status: write failed: ${writeErr.message || 'Unknown'}`);
  }
};
10341
 
10342
+ if (hasImages) {
10343
+ const totalImages = imageContents.length;
10344
+ log.info(`๐Ÿ–ผ๏ธ Step 1: Rox Vision analyzing ${totalImages} image(s) sequentially...`);
10345
 
10346
+ // Send initial vision status
10347
+ const imageText = totalImages === 1 ? 'your image' : `${totalImages} images`;
10348
+ sendVisionStatus(`๐Ÿ‘๏ธ Rox Vision is analyzing ${imageText}...`, 'analyzing', 1, totalImages);
10349
 
10350
+ // ==================== SEQUENTIAL IMAGE PROCESSING ====================
10351
+ // Vision models only support 1 image at a time, so we process each image
10352
+ // separately and combine the analyses with clear image numbering
10353
+ const imageAnalyses = [];
10354
+
10355
+ /**
10356
+ * Analyze a single image with primary vision model, fallback to Rox Vision Max if needed
10357
+ * @param {Object} imageContent - Single image content object
10358
+ * @param {number} imageIndex - 1-based index of the image
10359
+ * @returns {Promise<{analysis: string|null, usedFallback: boolean}>}
10360
+ */
10361
+ const analyzeSingleImage = async (imageContent, imageIndex) => {
10362
+ const imageLabel = totalImages > 1 ? `Image ${imageIndex} of ${totalImages}` : 'the image';
10363
+ const imagePrompt = totalImages > 1
10364
+ ? `You are analyzing ${imageLabel}. The user has uploaded ${totalImages} images total.\n\nUser's question: "${message}"\n\nProvide a detailed analysis of THIS specific image (Image ${imageIndex}). Start your analysis with "**Image ${imageIndex}:**" so the user knows which image you're describing.`
10365
+ : `Please analyze the following image thoroughly. The user's question is: "${message}"\n\nProvide a detailed analysis that will help answer their question.`;
10366
+
10367
+ // Send status: analyzing this specific image
10368
+ if (totalImages > 1) {
10369
+ sendVisionStatus(`๐Ÿ‘๏ธ Rox Vision analyzing image ${imageIndex} of ${totalImages}...`, 'analyzing', imageIndex, totalImages);
10370
+ }
10371
+
10372
+ // Try primary Rox Vision first
10373
+ try {
10374
+ const visionMessages = [
10375
+ { role: 'system', content: ROX_VISION_ANALYSIS_PROMPT },
10376
  {
10377
  role: 'user',
10378
  content: [
10379
+ { type: 'text', text: imagePrompt },
10380
+ imageContent
10381
  ]
10382
  }
10383
  ];
10384
 
10385
+ const visionResponse = await openai.chat.completions.create({
10386
+ model: config.visionModel,
10387
+ messages: visionMessages,
10388
  temperature: 0.3,
10389
  top_p: 0.9,
10390
  max_tokens: 4096,
10391
  stream: false
10392
  });
10393
 
10394
+ const analysis = visionResponse.choices?.[0]?.message?.content || null;
10395
 
10396
+ // Check if vision model refused
10397
+ if (analysis && !isVisionRefusal(analysis)) {
10398
+ log.info(`๐Ÿ‘๏ธ Rox Vision analyzed Image ${imageIndex}/${totalImages}`);
10399
+ // Send success status for this image
10400
+ if (totalImages > 1) {
10401
+ sendVisionStatus(`โœ… Image ${imageIndex} analyzed successfully`, 'analyzed', imageIndex, totalImages);
10402
+ }
10403
+ return { analysis, usedFallback: false };
10404
  }
10405
+
10406
+ // Primary refused, try fallback
10407
+ log.warn(`โš ๏ธ Rox Vision refused Image ${imageIndex}, trying Rox Vision Max...`);
10408
+ sendVisionStatus(`๐Ÿ”„ Switching to Rox Vision Max for image ${totalImages > 1 ? imageIndex : ''}...`, 'fallback', imageIndex, totalImages);
10409
+ } catch (primaryError) {
10410
+ log.warn(`โš ๏ธ Rox Vision error on Image ${imageIndex}: ${primaryError.message || 'Unknown'}`);
10411
+ sendVisionStatus(`๐Ÿ”„ Trying Rox Vision Max...`, 'fallback', imageIndex, totalImages);
10412
  }
 
 
10413
 
10414
+ // Try Rox Vision Max as fallback
10415
  try {
10416
+ const fallbackPrompt = totalImages > 1
10417
+ ? `You are analyzing ${imageLabel}. The user has uploaded ${totalImages} images total.\n\nUser's question: "${message}"\n\nProvide a detailed analysis of THIS specific image (Image ${imageIndex}). Start your analysis with "**Image ${imageIndex}:**"`
10418
+ : `Please analyze the following image. User question: "${message}"`;
10419
+
10420
+ const fallbackMessages = [
10421
  { role: 'system', content: ROX_VISION_MAX_ANALYSIS_PROMPT },
10422
  {
10423
  role: 'user',
10424
  content: [
10425
+ { type: 'text', text: fallbackPrompt },
10426
+ imageContent
10427
  ]
10428
  }
10429
  ];
10430
 
10431
  const fallbackResponse = await openai.chat.completions.create({
10432
  model: config.fallbackVisionModel,
10433
+ messages: fallbackMessages,
10434
  temperature: 0.3,
10435
  top_p: 0.9,
10436
  max_tokens: 4096,
 
10439
 
10440
  const fallbackAnalysis = fallbackResponse.choices?.[0]?.message?.content || null;
10441
 
 
10442
  if (fallbackAnalysis && !isVisionRefusal(fallbackAnalysis)) {
10443
+ log.info(`๐Ÿ‘๏ธโœจ Rox Vision Max analyzed Image ${imageIndex}/${totalImages}`);
10444
+ if (totalImages > 1) {
10445
+ sendVisionStatus(`โœ… Image ${imageIndex} analyzed by Rox Vision Max`, 'analyzed', imageIndex, totalImages);
10446
+ }
10447
+ return { analysis: fallbackAnalysis, usedFallback: true };
 
10448
  }
10449
+
10450
+ log.warn(`โš ๏ธ Rox Vision Max also refused Image ${imageIndex}`);
10451
+ sendVisionStatus(`โš ๏ธ Could not analyze image ${totalImages > 1 ? imageIndex : ''}`, 'error', imageIndex, totalImages);
10452
+ return { analysis: null, usedFallback: true };
10453
  } catch (fallbackError) {
10454
+ log.error(`โŒ Both vision models failed on Image ${imageIndex}: ${fallbackError.message || 'Unknown'}`);
10455
+ return { analysis: null, usedFallback: true };
 
10456
  }
10457
+ };
10458
+
10459
+ // Process each image sequentially (vision models don't support multiple images)
10460
+ let anyUsedFallback = false;
10461
+ for (let i = 0; i < imageContents.length; i++) {
10462
+ const result = await analyzeSingleImage(imageContents[i], i + 1);
10463
+ if (result.analysis) {
10464
+ imageAnalyses.push(result.analysis);
10465
+ }
10466
+ if (result.usedFallback) {
10467
+ anyUsedFallback = true;
10468
+ }
10469
+ }
10470
+
10471
+ // Combine all image analyses
10472
+ if (imageAnalyses.length > 0) {
10473
+ if (totalImages === 1) {
10474
+ // Single image - use analysis directly
10475
+ visionAnalysis = imageAnalyses[0];
10476
+ } else {
10477
+ // Multiple images - combine with clear separation
10478
+ visionAnalysis = `## Analysis of ${totalImages} Images\n\n` + imageAnalyses.join('\n\n---\n\n');
10479
+ }
10480
+ usingFallbackVision = anyUsedFallback;
10481
+ log.info(`๐Ÿ‘๏ธ Vision analysis complete: ${imageAnalyses.length}/${totalImages} images analyzed`);
10482
+
10483
+ // Send status: sending image context to main LLM
10484
+ sendVisionStatus(`๐Ÿ“ค Sending image context to ${config.name}...`, 'sending', totalImages, totalImages);
10485
+ } else {
10486
+ log.warn(`โš ๏ธ No images could be analyzed`);
10487
+ sendVisionStatus(`โš ๏ธ Vision analysis unavailable`, 'error', 0, totalImages);
10488
+ visionAnalysis = null;
10489
  }
10490
  }
10491
 
 
10499
  // Inject vision analysis into the user message for the main LLM
10500
  // The main LLM will use this analysis to generate the response
10501
  const visionSibling = usingFallbackVision ? 'Rox Vision Max' : 'Rox Vision';
10502
+ const imageCount = imageContents.length;
10503
+ const imageCountText = imageCount === 1 ? '1 image' : `${imageCount} images`;
10504
+
10505
+ // Send status: main LLM is now generating response
10506
+ sendVisionStatus(`๐Ÿง  ${config.name} is thinking about your ${imageCountText}...`, 'generating', imageCount, imageCount);
10507
+
10508
  const visionContext = `
10509
  ## ๐Ÿ‘๏ธ IMAGE ANALYSIS FROM YOUR VISION SIBLING (${visionSibling})
10510
 
10511
+ Your vision sibling has analyzed ${imageCountText} attached by the user and provided the following information:
10512
 
10513
  ${visionAnalysis}
10514
 
 
10516
 
10517
  **USER'S ORIGINAL QUESTION:** ${message}
10518
 
10519
+ **YOUR TASK:** Using the image analysis above from your vision sibling, provide a helpful response to the user's question. ${imageCount > 1 ? `The user uploaded ${imageCount} images - make sure to address each image in your response (e.g., "In the 1st image...", "In the 2nd image...").` : 'You can reference the visual details your sibling identified.'} Remember, you're working as a team - your vision sibling sees the image(s), and you provide the intelligent response!
10520
  `;
10521
 
10522
  // Replace the last user message with the enhanced version (text only, no images)