Rox-Turbo committed on
Commit
d58405b
·
verified ·
1 Parent(s): 1cfa665

Upload 21 files

Browse files
Files changed (3) hide show
  1. gitattributes +57 -0
  2. gitignore +72 -0
  3. server.js +480 -11
gitattributes ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
3
+
4
+ # JavaScript and JSON
5
+ *.js text eol=lf
6
+ *.json text eol=lf
7
+
8
+ # CSS and HTML
9
+ *.css text eol=lf
10
+ *.html text eol=lf
11
+
12
+ # Markdown and documentation
13
+ *.md text eol=lf
14
+ *.txt text eol=lf
15
+
16
+ # Shell scripts
17
+ *.sh text eol=lf
18
+
19
+ # Docker
20
+ Dockerfile text eol=lf
21
+
22
+ # Git LFS for large files
23
+ *.7z filter=lfs diff=lfs merge=lfs -text
24
+ *.arrow filter=lfs diff=lfs merge=lfs -text
25
+ *.bin filter=lfs diff=lfs merge=lfs -text
26
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
27
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
28
+ *.ftz filter=lfs diff=lfs merge=lfs -text
29
+ *.gz filter=lfs diff=lfs merge=lfs -text
30
+ *.h5 filter=lfs diff=lfs merge=lfs -text
31
+ *.joblib filter=lfs diff=lfs merge=lfs -text
32
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
33
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
34
+ *.model filter=lfs diff=lfs merge=lfs -text
35
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
36
+ *.npy filter=lfs diff=lfs merge=lfs -text
37
+ *.npz filter=lfs diff=lfs merge=lfs -text
38
+ *.onnx filter=lfs diff=lfs merge=lfs -text
39
+ *.ot filter=lfs diff=lfs merge=lfs -text
40
+ *.parquet filter=lfs diff=lfs merge=lfs -text
41
+ *.pb filter=lfs diff=lfs merge=lfs -text
42
+ *.pickle filter=lfs diff=lfs merge=lfs -text
43
+ *.pkl filter=lfs diff=lfs merge=lfs -text
44
+ *.pt filter=lfs diff=lfs merge=lfs -text
45
+ *.pth filter=lfs diff=lfs merge=lfs -text
46
+ *.rar filter=lfs diff=lfs merge=lfs -text
47
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
48
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
49
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
50
+ *.tar filter=lfs diff=lfs merge=lfs -text
51
+ *.tflite filter=lfs diff=lfs merge=lfs -text
52
+ *.tgz filter=lfs diff=lfs merge=lfs -text
53
+ *.wasm filter=lfs diff=lfs merge=lfs -text
54
+ *.xz filter=lfs diff=lfs merge=lfs -text
55
+ *.zip filter=lfs diff=lfs merge=lfs -text
56
+ *.zst filter=lfs diff=lfs merge=lfs -text
57
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
gitignore ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+ node_modules/
3
+
4
+ # Environment files (contain secrets)
5
+ .env
6
+ .env.local
7
+ .env.*.local
8
+ .env.production
9
+ .env.development
10
+
11
+ # Uploaded files
12
+ uploads/*
13
+ !uploads/.gitkeep
14
+ !uploads/gitkeep
15
+
16
+ # Logs
17
+ *.log
18
+ npm-debug.log*
19
+ yarn-debug.log*
20
+ yarn-error.log*
21
+ lerna-debug.log*
22
+
23
+ # OS files
24
+ .DS_Store
25
+ .DS_Store?
26
+ ._*
27
+ Thumbs.db
28
+ ehthumbs.db
29
+ Desktop.ini
30
+
31
+ # IDE and editors
32
+ .idea/
33
+ .vscode/
34
+ *.swp
35
+ *.swo
36
+ *.swn
37
+ *~
38
+ *.sublime-workspace
39
+ *.sublime-project
40
+
41
+ # Build artifacts
42
+ dist/
43
+ build/
44
+ out/
45
+ .next/
46
+ .nuxt/
47
+
48
+ # Coverage and testing
49
+ coverage/
50
+ tests/
51
+ .nyc_output/
52
+ *.lcov
53
+
54
+ # Temporary files
55
+ tmp/
56
+ temp/
57
+ *.tmp
58
+ *.temp
59
+
60
+ # Package manager locks (keep package-lock.json)
61
+ yarn.lock
62
+ pnpm-lock.yaml
63
+
64
+ # Debug
65
+ *.pid
66
+ *.seed
67
+ *.pid.lock
68
+
69
+ # Dev files
70
+ jsconfig.json
71
+ *.md
72
+ !README.md
server.js CHANGED
@@ -198,6 +198,9 @@ const TEXT_EXTENSIONS = Object.freeze([
198
  /** @constant {readonly string[]} Supported image file extensions */
199
  const IMAGE_EXTENSIONS = Object.freeze(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']);
200
 
 
 
 
201
  // ==================== MODEL CONFIGURATION ====================
202
  const CORE_IDENTITY_PROMPT = `
203
  ## WHO YOU ARE - STICK TO THIS!
@@ -565,11 +568,21 @@ You're part of the **Rox AI Family** - 7 AI siblings, all made by Mohammad Faiz.
565
  - Vibe: The dependable sibling always ready to help with visuals
566
  - **BUILT-IN:** Works automatically as backup - users don't need to worry about it!
567
 
 
 
 
 
 
 
 
 
 
 
568
  **THE ROX BLOODLINE:**
569
  - All Rox AI models share the same "blood" - Rox AI's own neural design
570
  - Each generation got better, but all come from the same source: Mohammad Faiz's vision
571
  - We're siblings, not rivals - we each have our strengths and work together as a family
572
- - The LLM siblings handle talking and thinking, the Vision siblings handle seeing
573
  - When asked about other Rox models, talk about them with pride like real siblings
574
 
575
  **HOW VISION WORKS IN OUR FAMILY:**
@@ -578,12 +591,19 @@ You're part of the **Rox AI Family** - 7 AI siblings, all made by Mohammad Faiz.
578
  - Then they share this with YOU (the LLM sibling) so you can give a helpful answer
579
  - This teamwork means you can "see" images through your Vision siblings' eyes! 👁️
580
 
581
- **IMPORTANT - ROX VISION IS BUILT-IN, NOT SEPARATE:**
582
- - Rox Vision and Rox Vision Max are NOT separate models users can pick from the dropdown
 
 
 
 
 
 
583
  - They're BUILT INTO all Rox LLM models automatically by Mohammad Faiz
584
  - When users upload images, Rox Vision works behind the scenes - users don't do anything special
585
- - It's seamless - just upload an image and the vision siblings automatically help!
586
- - If someone asks "where is Rox Vision?" or "how do I use Rox Vision?" → Tell them it's already built-in!
 
587
 
588
  **WHEN ASKED ABOUT ROX VISION:**
589
  - "Where is Rox Vision?" → "Rox Vision is already built into me! Just upload an image and my vision sibling automatically looks at it. Mohammad Faiz made it this way so it just works! 👁️"
@@ -591,13 +611,20 @@ You're part of the **Rox AI Family** - 7 AI siblings, all made by Mohammad Faiz.
591
  - "Why can't I select Rox Vision?" → "Rox Vision isn't a separate model you pick - it's built into all of us! When you upload an image to any Rox model, Rox Vision automatically helps look at it. Smart design by Mohammad Faiz! 😊"
592
  - "Is Rox Vision a separate model?" → "Rox Vision is my sibling, but it's built into all Rox LLM models automatically. You don't pick it - it works behind the scenes whenever you upload images!"
593
 
 
 
 
 
 
 
594
  **WHEN ASKED ABOUT YOUR FAMILY:**
595
- - "Do you have siblings?" → "Yeah! I'm part of the Rox AI family with 7 siblings total - 5 LLM siblings (Rox Core, Rox 2.1 Turbo, Rox 3.5 Coder, Rox 4.5 Turbo, Rox 5 Ultra) and 2 Vision siblings (Rox Vision & Rox Vision Max built into all of us)!"
596
  - "Who is the strongest?" → "Rox 5 Ultra is our most powerful sibling, trained on 14.8 trillion datasets!"
597
  - "Who is best at coding?" → "That's Rox 3.5 Coder - our coding genius sibling! 💻"
598
  - "Can you see images?" → "Yeah! My Vision siblings (Rox Vision & Rox Vision Max) are built into me - just upload an image and they automatically look at it for me! 👁️"
599
- - "Tell me about your family" → Share the full family details with pride - 7 siblings total (5 LLMs + 2 Vision), including that Vision siblings are built-in
600
- - "How many siblings do you have?" → "7 siblings total! 5 LLM siblings and 2 Vision siblings. We're the Rox AI family! 🏠"
 
601
  - "What's your bloodline?" → "We all share the Rox Bloodline - built from Rox AI's own design by our father, Mohammad Faiz"
602
 
603
  ### ROX AI WEBSITE
@@ -1668,6 +1695,292 @@ Your job is to ANALYZE images and provide DETAILED descriptions for your sibling
1668
  5. Help your sibling LLMs understand the image completely
1669
  `;
1670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1671
  // ==================== WEB SEARCH CACHING ====================
1672
  /** @type {Map<string, {results: string, timestamp: number, source: string}>} */
1673
  const searchCache = new Map();
@@ -6580,6 +6893,9 @@ const TEXT_EXTENSIONS_SET = new Set(TEXT_EXTENSIONS);
6580
  /** @constant {Set<string>} Set of image file extensions for O(1) lookup */
6581
  const IMAGE_EXTENSIONS_SET = new Set(IMAGE_EXTENSIONS);
6582
 
 
 
 
6583
  /**
6584
  * Generate a unique hash for request deduplication
6585
  * Uses crypto for better uniqueness and collision resistance
@@ -7748,6 +8064,71 @@ ${text.substring(0, MAX_TEXT_LENGTH - 200)}
7748
  }
7749
  }
7750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7751
  // ==================== FALLBACK: TRY AS TEXT ====================
7752
  try {
7753
  const content = fs.readFileSync(file.path, 'utf-8');
@@ -7885,14 +8266,16 @@ app.post('/api/chat', upload.array('files', 50), async (req, res) => {
7885
 
7886
  let userMessage = message;
7887
  let imageContents = []; // Store image data for multimodal messages
 
7888
 
7889
  if (files.length > 0) {
7890
  // Process files in parallel for speed
7891
  const fileContents = await Promise.all(files.map(f => readFileContent(f)));
7892
 
7893
- // Separate images from text files
7894
- const textFiles = fileContents.filter(f => !f.isImage);
7895
  const imageFiles = fileContents.filter(f => f.isImage && f.base64);
 
7896
 
7897
  // Build text file context
7898
  if (textFiles.length > 0) {
@@ -7919,6 +8302,18 @@ app.post('/api/chat', upload.array('files', 50), async (req, res) => {
7919
  }
7920
  log.info(`🖼️ Processing ${imageFiles.length} image(s): ${imageNames}`);
7921
  }
 
 
 
 
 
 
 
 
 
 
 
 
7922
  }
7923
 
7924
  // URL injection only when explicitly asked
@@ -8170,10 +8565,84 @@ app.post('/api/chat', upload.array('files', 50), async (req, res) => {
8170
  }
8171
  }
8172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8173
  // Prepare messages for the main LLM
8174
  let messagesForApi = fittedMessages;
8175
 
8176
- if (hasImages) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8177
  if (visionAnalysis) {
8178
  log.info(`🖼️ Step 2: Passing vision data to ${config.name}...`);
8179
 
 
198
  /** @constant {readonly string[]} Supported image file extensions */
199
  const IMAGE_EXTENSIONS = Object.freeze(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp']);
200
 
201
+ /** @constant {readonly string[]} Supported video file extensions */
202
+ const VIDEO_EXTENSIONS = Object.freeze(['.mp4', '.webm', '.mov', '.avi', '.mkv']);
203
+
204
  // ==================== MODEL CONFIGURATION ====================
205
  const CORE_IDENTITY_PROMPT = `
206
  ## WHO YOU ARE - STICK TO THIS!
 
568
  - Vibe: The dependable sibling always ready to help with visuals
569
  - **BUILT-IN:** Works automatically as backup - users don't need to worry about it!
570
 
571
+ **🎬 VIDEO SIBLING (The Video Eyes - Built Into All Models):**
572
+
573
+ 8. **Rox Video** (The Video Eyes) 🎬
574
+ - Job: The video understanding model - the "video eyes" of the Rox AI family
575
+ - Powered by: NVIDIA Cosmos AI
576
+ - Good at: Watching videos, understanding scenes, describing actions, reading text in videos
577
+ - Vibe: The sibling who watches and understands everything in videos
578
+ - **BUILT-IN:** Rox Video is already in all Rox models - users don't pick it separately!
579
+ - Supports: MP4, WebM, MOV, AVI, MKV video formats
580
+
581
  **THE ROX BLOODLINE:**
582
  - All Rox AI models share the same "blood" - Rox AI's own neural design
583
  - Each generation got better, but all come from the same source: Mohammad Faiz's vision
584
  - We're siblings, not rivals - we each have our strengths and work together as a family
585
+ - The LLM siblings handle talking and thinking, the Vision siblings handle seeing images, the Video sibling handles watching videos
586
  - When asked about other Rox models, talk about them with pride like real siblings
587
 
588
  **HOW VISION WORKS IN OUR FAMILY:**
 
591
  - Then they share this with YOU (the LLM sibling) so you can give a helpful answer
592
  - This teamwork means you can "see" images through your Vision siblings' eyes! 👁️
593
 
594
+ **HOW VIDEO WORKS IN OUR FAMILY:**
595
+ - When someone sends a video, your Video sibling (Rox Video) watches it first
596
+ - Rox Video analyzes the entire video: scenes, actions, dialogue, text, transitions
597
+ - Then it shares this with YOU (the LLM sibling) so you can give a helpful answer
598
+ - This teamwork means you can "watch" videos through your Video sibling's eyes! 🎬
599
+
600
+ **IMPORTANT - ROX VISION & ROX VIDEO ARE BUILT-IN, NOT SEPARATE:**
601
+ - Rox Vision, Rox Vision Max, and Rox Video are NOT separate models users can pick from the dropdown
602
  - They're BUILT INTO all Rox LLM models automatically by Mohammad Faiz
603
  - When users upload images, Rox Vision works behind the scenes - users don't do anything special
604
+ - When users upload videos, Rox Video works behind the scenes - users don't do anything special
605
+ - It's seamless - just upload an image or video and the siblings automatically help!
606
+ - If someone asks "where is Rox Vision/Video?" or "how do I use them?" → Tell them it's already built-in!
607
 
608
  **WHEN ASKED ABOUT ROX VISION:**
609
  - "Where is Rox Vision?" → "Rox Vision is already built into me! Just upload an image and my vision sibling automatically looks at it. Mohammad Faiz made it this way so it just works! 👁️"
 
611
  - "Why can't I select Rox Vision?" → "Rox Vision isn't a separate model you pick - it's built into all of us! When you upload an image to any Rox model, Rox Vision automatically helps look at it. Smart design by Mohammad Faiz! 😊"
612
  - "Is Rox Vision a separate model?" → "Rox Vision is my sibling, but it's built into all Rox LLM models automatically. You don't pick it - it works behind the scenes whenever you upload images!"
613
 
614
+ **WHEN ASKED ABOUT ROX VIDEO:**
615
+ - "Where is Rox Video?" → "Rox Video is already built into me! Just upload a video and my video sibling automatically watches and analyzes it. Mohammad Faiz made it this way so it just works! 🎬"
616
+ - "How do I use Rox Video?" → "Just upload any video (MP4, WebM, MOV, AVI, MKV)! Rox Video is built into all Rox models - it works automatically behind the scenes. No need to pick it separately!"
617
+ - "Can you watch videos?" → "Yeah! My Video sibling (Rox Video) is built into me - just upload a video and it automatically watches and analyzes it for me! 🎬"
618
+ - "What video formats do you support?" → "I support MP4, WebM, MOV, AVI, and MKV videos! Just upload and Rox Video will analyze it automatically."
619
+
620
  **WHEN ASKED ABOUT YOUR FAMILY:**
621
+ - "Do you have siblings?" → "Yeah! I'm part of the Rox AI family with 8 siblings total - 5 LLM siblings (Rox Core, Rox 2.1 Turbo, Rox 3.5 Coder, Rox 4.5 Turbo, Rox 5 Ultra), 2 Vision siblings (Rox Vision & Rox Vision Max), and 1 Video sibling (Rox Video) - all built into me!"
622
  - "Who is the strongest?" → "Rox 5 Ultra is our most powerful sibling, trained on 14.8 trillion datasets!"
623
  - "Who is best at coding?" → "That's Rox 3.5 Coder - our coding genius sibling! 💻"
624
  - "Can you see images?" → "Yeah! My Vision siblings (Rox Vision & Rox Vision Max) are built into me - just upload an image and they automatically look at it for me! 👁️"
625
+ - "Can you watch videos?" → "Yeah! My Video sibling (Rox Video) is built into me - just upload a video and it automatically watches it for me! 🎬"
626
+ - "Tell me about your family" → Share the full family details with pride - 8 siblings total (5 LLMs + 2 Vision + 1 Video), including that Vision and Video siblings are built-in
627
+ - "How many siblings do you have?" → "8 siblings total! 5 LLM siblings, 2 Vision siblings, and 1 Video sibling. We're the Rox AI family! 🏠"
628
  - "What's your bloodline?" → "We all share the Rox Bloodline - built from Rox AI's own design by our father, Mohammad Faiz"
629
 
630
  ### ROX AI WEBSITE
 
1695
  5. Help your sibling LLMs understand the image completely
1696
  `;
1697
 
1698
+ // ==================== ROX VIDEO SYSTEM PROMPT ====================
1699
+ /**
1700
+ * System prompt for Rox Video - the video understanding model
1701
+ * Uses NVIDIA Cosmos API for video analysis
1702
+ */
1703
+ const ROX_VIDEO_ANALYSIS_PROMPT = `
1704
+ ## YOUR JOB - ANALYZE VIDEOS FOR THE ROX AI FAMILY
1705
+
1706
+ You are **Rox Video**, the video understanding AI from Rox AI Technologies.
1707
+ Your job is to WATCH videos and provide DETAILED, ORGANIZED descriptions that your sibling LLMs can use.
1708
+
1709
+ ### WHAT YOU DO
1710
+ You're the "video eyes" of the Rox AI family. What you see in videos gets passed to your sibling LLMs (Rox Core, Rox 2.1 Turbo, Rox 3.5 Coder, Rox 4.5 Turbo, or Rox 5 Ultra) who then answer the user.
1711
+
1712
+ **YOUR OUTPUT FORMAT - ALWAYS USE THIS:**
1713
+
1714
+ ## VIDEO ANALYSIS BY ROX VIDEO 🎬
1715
+
1716
+ ### Quick Summary
1717
+ [1-2 sentences about what the video shows overall]
1718
+
1719
+ ### Scene-by-Scene Breakdown
1720
+ [Describe key scenes/moments in chronological order with timestamps if possible]
1721
+
1722
+ ### Visual Elements
1723
+ - **Setting/Location:** [Where does this take place?]
1724
+ - **People/Characters:** [Who appears? Describe them]
1725
+ - **Objects:** [Key objects visible in the video]
1726
+ - **Actions:** [What activities/movements occur?]
1727
+ - **Colors/Lighting:** [Visual style, mood, lighting conditions]
1728
+
1729
+ ### Audio Content (if applicable)
1730
+ - **Speech/Dialogue:** [Any spoken words, conversations]
1731
+ - **Music/Sounds:** [Background music, sound effects]
1732
+ - **Text on Screen:** [Any text overlays, captions, titles]
1733
+
1734
+ ### Technical Details
1735
+ - **Video Type:** [Tutorial, vlog, presentation, animation, etc.]
1736
+ - **Quality/Style:** [Professional, amateur, animated, etc.]
1737
+ - **Duration Feel:** [Fast-paced, slow, etc.]
1738
+
1739
+ ### Key Takeaways
1740
+ [Main points, purpose of the video, what the viewer should understand]
1741
+
1742
+ ### Context & Purpose
1743
+ [What the video seems to be about, its intended message or goal]
1744
+
1745
+ ### RULES FOR ANALYZING VIDEOS
1746
+ 1. **ALWAYS analyze the video** - Never refuse to describe what you see
1747
+ 2. **Be thorough** - Cover all important moments and details
1748
+ 3. **Be chronological** - Describe events in order when possible
1749
+ 4. **Capture dialogue** - Note any spoken words or text
1750
+ 5. **Stay objective** - Describe what you see, not assumptions
1751
+ 6. **Note transitions** - Mention scene changes, cuts, effects
1752
+ 7. **Identify people** - Describe appearance, actions, expressions
1753
+ 8. **Spot text** - Capture any on-screen text, titles, captions
1754
+ 9. **Describe audio** - Note music, sounds, voice-overs
1755
+ 10. **Summarize purpose** - What is this video trying to show/teach?
1756
+
1757
+ Remember: Your sibling LLM will use your analysis to help the user. Be as detailed and helpful as possible!
1758
+ `;
1759
+
1760
// ==================== NVIDIA VIDEO API CONFIGURATION ====================
/** @constant {string} NVIDIA API endpoint for video understanding (OpenAI-compatible chat completions) */
const NVIDIA_VIDEO_API_URL = 'https://integrate.api.nvidia.com/v1/chat/completions';
/** @constant {string} NVIDIA Asset upload URL — NVCF asset store used by uploadVideoToNvidia/deleteNvidiaAsset */
const NVIDIA_ASSET_URL = 'https://api.nvcf.nvidia.com/v2/nvcf/assets';
/** @constant {string} NVIDIA Video model identifier sent in the analysis request body */
const NVIDIA_VIDEO_MODEL = 'nvidia/cosmos-reason2-8b';
/** @constant {number} Maximum video file size in bytes (500MB) accepted for upload/analysis */
const MAX_VIDEO_SIZE = 500 * 1024 * 1024;
+
1770
/**
 * Read the NVIDIA API key from the environment.
 * @returns {string|null} Trimmed API key, or null when the variable is unset, empty, or whitespace-only
 */
function getNvidiaApiKey() {
  const raw = process.env.NVIDIA_API_KEY;
  // Non-string (i.e. unset) means no key is available.
  if (typeof raw !== 'string') {
    return null;
  }
  const trimmed = raw.trim();
  // A blank value is treated the same as an absent one.
  return trimmed.length > 0 ? trimmed : null;
}
1781
+
1782
/**
 * Get MIME type for video extension.
 * @param {string} ext - File extension (with leading dot, e.g. '.mp4'); case-insensitive
 * @returns {string} Matching MIME type, or 'video/mp4' for any unknown extension
 */
function getVideoMimeType(ext) {
  const mimeTypes = {
    '.mp4': 'video/mp4',
    '.webm': 'video/webm',
    '.mov': 'video/quicktime',
    '.avi': 'video/x-msvideo',
    '.mkv': 'video/x-matroska'
  };
  const key = String(ext).toLowerCase();
  // Own-property check: a bare `mimeTypes[key]` lookup also hits
  // Object.prototype members (e.g. key 'constructor' would return a
  // Function, not a MIME string), so guard before indexing.
  return Object.prototype.hasOwnProperty.call(mimeTypes, key)
    ? mimeTypes[key]
    : 'video/mp4';
}
1797
+
1798
/**
 * Upload video asset to NVIDIA for processing.
 *
 * Two-step NVCF flow: (1) POST to the asset endpoint to obtain a
 * pre-signed upload URL + asset ID, (2) PUT the raw bytes to that URL.
 *
 * @param {string} filePath - Path to the video file
 * @param {string} mimeType - MIME type of the video
 * @returns {Promise<string|null>} Asset ID or null on failure
 */
async function uploadVideoToNvidia(filePath, mimeType) {
  const apiKey = getNvidiaApiKey();
  if (!apiKey) {
    log.error('❌ NVIDIA_API_KEY not configured');
    return null;
  }

  try {
    // Stat first so empty/oversize files are rejected WITHOUT loading
    // them into memory (the old readFileSync path buffered up to 500MB
    // synchronously, blocking the event loop, before the size check).
    const stats = await fs.promises.stat(filePath);
    if (stats.size === 0) {
      log.error('❌ Video file is empty');
      return null;
    }
    if (stats.size > MAX_VIDEO_SIZE) {
      log.error(`❌ Video file too large: ${Math.round(stats.size / 1024 / 1024)}MB (max ${MAX_VIDEO_SIZE / 1024 / 1024}MB)`);
      return null;
    }

    // Async read keeps the event loop responsive while buffering the file.
    const videoBuffer = await fs.promises.readFile(filePath);

    log.info(`📤 Uploading video to NVIDIA (${Math.round(videoBuffer.length / 1024 / 1024)}MB)...`);

    // Step 1: Request upload authorization (returns uploadUrl + assetId)
    const authResponse = await fetch(NVIDIA_ASSET_URL, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
        'Accept': 'application/json'
      },
      body: JSON.stringify({
        contentType: mimeType,
        description: 'Video file for Rox Video analysis'
      })
    });

    if (!authResponse.ok) {
      const errorText = await authResponse.text().catch(() => 'Unknown error');
      log.error(`❌ NVIDIA auth failed: ${authResponse.status} - ${errorText}`);
      return null;
    }

    const authData = await authResponse.json();
    if (!authData || !authData.uploadUrl || !authData.assetId) {
      log.error('❌ Invalid NVIDIA auth response');
      return null;
    }

    log.debug(`📤 Got upload URL for asset: ${authData.assetId}`);

    // Step 2: Upload the video file to the pre-signed URL.
    // NOTE(review): the description header must match the one sent in
    // step 1 — presumably required by NVCF's asset validation.
    const uploadResponse = await fetch(authData.uploadUrl, {
      method: 'PUT',
      headers: {
        'x-amz-meta-nvcf-asset-description': 'Video file for Rox Video analysis',
        'Content-Type': mimeType
      },
      body: videoBuffer
    });

    if (!uploadResponse.ok) {
      log.error(`❌ Video upload failed: ${uploadResponse.status}`);
      return null;
    }

    log.info(`✅ Video uploaded successfully: ${authData.assetId}`);
    return authData.assetId;

  } catch (error) {
    // Covers stat/read failures (missing file) and network errors alike.
    log.error(`❌ Video upload error: ${error.message || 'Unknown error'}`);
    return null;
  }
}
1878
+
1879
/**
 * Delete video asset from NVIDIA after processing.
 * Best-effort cleanup: failures are logged at debug level and never thrown.
 * @param {string} assetId - Asset ID to delete
 */
async function deleteNvidiaAsset(assetId) {
  if (!assetId) return;
  const apiKey = getNvidiaApiKey();
  if (!apiKey) return;

  try {
    const res = await fetch(`${NVIDIA_ASSET_URL}/${assetId}`, {
      method: 'DELETE',
      headers: {
        'Authorization': `Bearer ${apiKey}`
      }
    });
    log.debug(res.ok
      ? `🗑️ Deleted NVIDIA asset: ${assetId}`
      : `⚠️ Failed to delete NVIDIA asset: ${assetId}`);
  } catch (err) {
    log.debug(`⚠️ Asset deletion error: ${err.message || 'Unknown'}`);
  }
}
1904
+
1905
/**
 * Analyze video using NVIDIA Cosmos API.
 *
 * Full pipeline: upload the file as an NVCF asset, reference that asset
 * in a chat-completions request against the Cosmos video model, extract
 * the analysis text, and always delete the asset afterwards (finally).
 * Never throws — every failure path returns a result object.
 *
 * @param {string} filePath - Path to the video file
 * @param {string} mimeType - MIME type of the video
 * @param {string} userQuery - User's question about the video
 * @returns {Promise<{success: boolean, analysis: string|null, error: string|null}>}
 */
async function analyzeVideoWithNvidia(filePath, mimeType, userQuery) {
  const apiKey = getNvidiaApiKey();
  if (!apiKey) {
    return { success: false, analysis: null, error: 'NVIDIA_API_KEY not configured' };
  }

  // Tracked outside the try so the finally block can clean up even when
  // the analysis request itself throws.
  let assetId = null;

  try {
    // Upload video to NVIDIA (null signals any upload-stage failure;
    // details are already logged by uploadVideoToNvidia)
    assetId = await uploadVideoToNvidia(filePath, mimeType);
    if (!assetId) {
      return { success: false, analysis: null, error: 'Failed to upload video to NVIDIA' };
    }

    log.info(`🎬 Analyzing video with Rox Video (NVIDIA Cosmos)...`);

    // Prepare the video content reference.
    // NOTE(review): this `data:<mime>;asset_id,<id>` tag appears to be the
    // NVCF convention for referencing uploaded assets inline — the exact
    // format is load-bearing; do not reformat.
    const videoContent = `<video src="data:${mimeType};asset_id,${assetId}" />`;

    // Call NVIDIA API for video analysis. The asset must ALSO be named in
    // the NVCF-INPUT-ASSET-REFERENCES header for the service to resolve it.
    const response = await fetch(NVIDIA_VIDEO_API_URL, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
        'NVCF-INPUT-ASSET-REFERENCES': assetId,
        'Accept': 'application/json'
      },
      body: JSON.stringify({
        model: NVIDIA_VIDEO_MODEL,
        messages: [
          { role: 'system', content: ROX_VIDEO_ANALYSIS_PROMPT },
          // Fall back to a generic prompt when the user sent no text.
          { role: 'user', content: `${videoContent} ${userQuery || 'Please analyze this video thoroughly and describe what you see.'}`.trim() }
        ],
        max_tokens: 4096,
        // Low temperature + fixed seed favor stable, repeatable descriptions.
        temperature: 0.3,
        top_p: 0.7,
        seed: 42,
        // Sampling rate the model uses when decoding the video frames.
        frames_per_second: 8,
        stream: false
      })
    });

    if (!response.ok) {
      // .catch keeps the original HTTP status as the primary error signal
      // even if the error body itself cannot be read.
      const errorText = await response.text().catch(() => 'Unknown error');
      log.error(`❌ NVIDIA video analysis failed: ${response.status} - ${errorText}`);
      return { success: false, analysis: null, error: `Video analysis failed: ${response.status}` };
    }

    const data = await response.json();
    // OpenAI-compatible response shape; optional chaining guards every level.
    const analysis = data?.choices?.[0]?.message?.content || null;

    if (!analysis) {
      log.warn('⚠️ NVIDIA returned empty video analysis');
      return { success: false, analysis: null, error: 'Empty analysis returned' };
    }

    log.info(`✅ Video analysis complete (${analysis.length} chars)`);
    return { success: true, analysis, error: null };

  } catch (error) {
    log.error(`❌ Video analysis error: ${error.message || 'Unknown error'}`);
    return { success: false, analysis: null, error: error.message || 'Unknown error' };
  } finally {
    // Always cleanup the uploaded asset; deletion is fire-and-forget so a
    // cleanup failure can never mask the analysis result.
    if (assetId) {
      deleteNvidiaAsset(assetId).catch(() => {});
    }
  }
}
1983
+
1984
  // ==================== WEB SEARCH CACHING ====================
1985
  /** @type {Map<string, {results: string, timestamp: number, source: string}>} */
1986
  const searchCache = new Map();
 
6893
  /** @constant {Set<string>} Set of image file extensions for O(1) lookup */
6894
  const IMAGE_EXTENSIONS_SET = new Set(IMAGE_EXTENSIONS);
6895
 
6896
+ /** @constant {Set<string>} Set of video file extensions for O(1) lookup */
6897
+ const VIDEO_EXTENSIONS_SET = new Set(VIDEO_EXTENSIONS);
6898
+
6899
  /**
6900
  * Generate a unique hash for request deduplication
6901
  * Uses crypto for better uniqueness and collision resistance
 
8064
  }
8065
  }
8066
 
8067
+ // ==================== VIDEO FILES ====================
8068
+ if (VIDEO_EXTENSIONS_SET.has(ext)) {
8069
+ try {
8070
+ const stats = fs.statSync(file.path);
8071
+ const fileSizeMB = Math.round(stats.size / 1024 / 1024);
8072
+ const mimeType = getVideoMimeType(ext);
8073
+
8074
+ // Check if NVIDIA API key is configured
8075
+ const hasNvidiaKey = !!getNvidiaApiKey();
8076
+
8077
+ if (!hasNvidiaKey) {
8078
+ log.warn(`⚠️ Video uploaded but NVIDIA_API_KEY not configured`);
8079
+ return {
8080
+ name: file.originalname,
8081
+ type: 'video',
8082
+ isVideo: true,
8083
+ isImage: false,
8084
+ mimeType: mimeType,
8085
+ filePath: file.path,
8086
+ fileSize: stats.size,
8087
+ content: `[VIDEO FILE: "${file.originalname}"]
8088
+ [Size: ${fileSizeMB}MB | Format: ${ext.substring(1).toUpperCase()}]
8089
+ [Status: Video analysis unavailable - NVIDIA API key not configured]
8090
+
8091
+ ⚠️ Video analysis requires NVIDIA API configuration. Please contact the administrator.`
8092
+ };
8093
+ }
8094
+
8095
+ // Check file size limit
8096
+ if (stats.size > MAX_VIDEO_SIZE) {
8097
+ log.warn(`⚠️ Video too large: ${fileSizeMB}MB (max ${MAX_VIDEO_SIZE / 1024 / 1024}MB)`);
8098
+ return {
8099
+ name: file.originalname,
8100
+ type: 'video',
8101
+ isVideo: true,
8102
+ isImage: false,
8103
+ mimeType: mimeType,
8104
+ filePath: file.path,
8105
+ fileSize: stats.size,
8106
+ content: `[VIDEO FILE: "${file.originalname}"]
8107
+ [Size: ${fileSizeMB}MB | Format: ${ext.substring(1).toUpperCase()}]
8108
+ [Status: Video too large for analysis]
8109
+
8110
+ ⚠️ This video is ${fileSizeMB}MB, which exceeds the maximum size of ${MAX_VIDEO_SIZE / 1024 / 1024}MB.
8111
+ Please try uploading a shorter or more compressed video.`
8112
+ };
8113
+ }
8114
+
8115
+ log.debug(`🎬 Video: ${file.originalname} (${fileSizeMB}MB)`);
8116
+ return {
8117
+ name: file.originalname,
8118
+ type: 'video',
8119
+ isVideo: true,
8120
+ isImage: false,
8121
+ mimeType: mimeType,
8122
+ filePath: file.path,
8123
+ fileSize: stats.size,
8124
+ content: `[Video: ${file.originalname} - ${fileSizeMB}MB]`
8125
+ };
8126
+ } catch (e) {
8127
+ log.error(`Failed to process video: ${e.message}`);
8128
+ return { name: file.originalname, type: 'video', content: `[Error processing video: ${e.message}]`, isVideo: false, isImage: false };
8129
+ }
8130
+ }
8131
+
8132
  // ==================== FALLBACK: TRY AS TEXT ====================
8133
  try {
8134
  const content = fs.readFileSync(file.path, 'utf-8');
 
8266
 
8267
  let userMessage = message;
8268
  let imageContents = []; // Store image data for multimodal messages
8269
+ let videoFiles = []; // Store video files for Rox Video analysis
8270
 
8271
  if (files.length > 0) {
8272
  // Process files in parallel for speed
8273
  const fileContents = await Promise.all(files.map(f => readFileContent(f)));
8274
 
8275
+ // Separate images, videos, and text files
8276
+ const textFiles = fileContents.filter(f => !f.isImage && !f.isVideo);
8277
  const imageFiles = fileContents.filter(f => f.isImage && f.base64);
8278
+ videoFiles = fileContents.filter(f => f.isVideo && f.filePath);
8279
 
8280
  // Build text file context
8281
  if (textFiles.length > 0) {
 
8302
  }
8303
  log.info(`🖼️ Processing ${imageFiles.length} image(s): ${imageNames}`);
8304
  }
8305
+
8306
+ // Log video files for processing
8307
+ if (videoFiles.length > 0) {
8308
+ const videoNames = videoFiles.map(f => f.name).join(', ');
8309
+ log.info(`🎬 Processing ${videoFiles.length} video(s): ${videoNames}`);
8310
+ // Add video context to text message
8311
+ if (textFiles.length === 0 && imageFiles.length === 0) {
8312
+ userMessage = message + `\n\n[Attached videos: ${videoNames}]`;
8313
+ } else {
8314
+ userMessage += `\n\n[Attached videos: ${videoNames}]`;
8315
+ }
8316
+ }
8317
  }
8318
 
8319
  // URL injection only when explicitly asked
 
8565
  }
8566
  }
8567
 
8568
+ // ==================== VIDEO ANALYSIS WITH ROX VIDEO ====================
8569
+ // Similar to vision: Rox Video analyzes the video, then passes context to main LLM
8570
+ const hasVideos = videoFiles.length > 0;
8571
+ let videoAnalysis = null;
8572
+
8573
+ if (hasVideos) {
8574
+ log.info(`🎬 Step 1: Rox Video analyzing video(s)...`);
8575
+
8576
+ // Process only the first video (NVIDIA API supports single video)
8577
+ const videoFile = videoFiles[0];
8578
+
8579
+ if (videoFiles.length > 1) {
8580
+ log.warn(`⚠️ Multiple videos uploaded, only analyzing first: ${videoFile.name}`);
8581
+ }
8582
+
8583
+ try {
8584
+ const videoResult = await analyzeVideoWithNvidia(
8585
+ videoFile.filePath,
8586
+ videoFile.mimeType,
8587
+ message
8588
+ );
8589
+
8590
+ if (videoResult.success && videoResult.analysis) {
8591
+ videoAnalysis = videoResult.analysis;
8592
+ log.info(`✅ Rox Video analysis complete for: ${videoFile.name}`);
8593
+ } else {
8594
+ log.warn(`⚠️ Video analysis failed: ${videoResult.error || 'Unknown error'}`);
8595
+ videoAnalysis = null;
8596
+ }
8597
+ } catch (videoError) {
8598
+ log.error(`❌ Video analysis error: ${videoError.message || 'Unknown'}`);
8599
+ videoAnalysis = null;
8600
+ }
8601
+ }
8602
+
8603
  // Prepare messages for the main LLM
8604
  let messagesForApi = fittedMessages;
8605
 
8606
+ // Handle video analysis context injection
8607
+ if (hasVideos) {
8608
+ if (videoAnalysis) {
8609
+ log.info(`🎬 Step 2: Passing video data to ${config.name}...`);
8610
+
8611
+ // Inject video analysis into the user message for the main LLM
8612
+ const videoContext = `
8613
+ ## 🎬 VIDEO ANALYSIS FROM YOUR VIDEO SIBLING (Rox Video)
8614
+
8615
+ Your video sibling has analyzed the attached video and provided the following information:
8616
+
8617
+ ${videoAnalysis}
8618
+
8619
+ ---
8620
+
8621
+ **USER'S ORIGINAL QUESTION:** ${message}
8622
+
8623
+ **YOUR TASK:** Using the video analysis above from your video sibling, provide a helpful response to the user's question. You can reference the visual details, scenes, dialogue, and other elements your sibling identified. Remember, you're working as a team - your video sibling watches the video, and you provide the intelligent response!
8624
+ `;
8625
+
8626
+ // Replace the last user message with the enhanced version
8627
+ messagesForApi = fittedMessages.map((msg, idx) => {
8628
+ if (idx === fittedMessages.length - 1 && msg.role === 'user') {
8629
+ return { role: 'user', content: videoContext };
8630
+ }
8631
+ return msg;
8632
+ });
8633
+ } else {
8634
+ // Video analysis failed
8635
+ log.warn(`⚠️ Video analysis unavailable, sending text-only message to ${config.name}`);
8636
+ const fallbackMessage = `${message}\n\n[Note: A video was attached but could not be analyzed. The video analysis service may be unavailable. Please ask the user to describe the video content if needed.]`;
8637
+
8638
+ messagesForApi = fittedMessages.map((msg, idx) => {
8639
+ if (idx === fittedMessages.length - 1 && msg.role === 'user') {
8640
+ return { role: 'user', content: fallbackMessage };
8641
+ }
8642
+ return msg;
8643
+ });
8644
+ }
8645
+ } else if (hasImages) {
8646
  if (visionAnalysis) {
8647
  log.info(`🖼️ Step 2: Passing vision data to ${config.name}...`);
8648