<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>VQA Architecture Draft</title>
  <!-- Mermaid is loaded as an ES module; type="module" is required for the import
       and also defers execution until the DOM is parsed. -->
  <script type="module">
    import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs';
    mermaid.initialize({ startOnLoad: true, theme: 'dark', flowchart: { curve: 'basis' } });
  </script>
  <style>
    body { background-color: #0D1117; color: white; font-family: sans-serif; display: flex; justify-content: center; padding: 20px; }
    .mermaid { background-color: #161B22; padding: 20px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.5); }
  </style>
</head>
<body>
  <!-- NOTE(review): emoji below were reconstructed from ISO-8859-7 mojibake in the
       original (e.g. "π±" = bytes F0 9F 93 B1 = 📱). The ones marked with (?) could
       not be decoded unambiguously — confirm against the original diagram. -->
  <div class="mermaid">
graph TD
  %% Styling
  %% (removed invalid "shape:rhombus" from the condition classDef — node shape in
  %% Mermaid comes from the {curly-brace} node syntax, not from classDef styles)
  classDef default fill:#1A1A1A,stroke:#444,stroke-width:2px,color:#FFF,rx:8px,ry:8px,font-family:arial;
  classDef mobile fill:#003366,stroke:#0055AA,stroke-width:2px,color:#FFF;
  classDef preproc fill:#333333,stroke:#555,stroke-width:2px,color:#FFF;
  classDef model fill:#4B0082,stroke:#8A2BE2,stroke-width:2px,color:#FFF;
  classDef condition fill:#2B2B2B,stroke:#F4A460,stroke-width:2px,color:#FFF;
  classDef external fill:#004d00,stroke:#009900,stroke-width:2px,color:#FFF;
  classDef final fill:#660000,stroke:#CC0000,stroke-width:2px,color:#FFF;
  %% Nodes
  UserApp[📱 Mobile App]:::mobile
  ImgUpload[🖼️ Image]:::preproc
  Question[⌨️ Question Text]:::preproc
  PIL[🐍 PIL Preprocessing<br/>RGB conversion]:::preproc
  CLIP[👁️ OpenAI CLIP ViT-B/32<br/>Image Features 512-dim]:::model
  GPT2[🤖 DistilGPT-2<br/>Tokenized Question]:::model
  Route1{Question<br/>spatial?}:::condition
  Spatial[📐 Spatial VQA Model<br/>8-head attention]:::model
  Base[🧠 Base VQA Model<br/>General VQA]:::model
  Decoder[🤖 GPT-2 Decoder<br/>vocab decode]:::model
  NeuralAns[💬 Neural Answer]:::final
  Route2{Knowledge<br/>question?}:::condition
  ObjDet[👁️ CLIP Object Detector<br/>Top-3 objects]:::model
  Wikidata[🌐 Wikidata SPARQL<br/>P31, P186, P366]:::external
  GroqV[⚡ Groq Llama-3.3<br/>Verbalizer]:::external
  KGAns[🧩 KG Enhancement]:::final
  FastAPI[🚀 FastAPI]:::preproc
  GroqA[⚡ Groq Llama-3.3<br/>Accessibility]:::external
  Audio[🔊 2-sentence description]:::final
  %% Edges
  UserApp -- "Image uploaded" --> ImgUpload
  UserApp -- "Question typed" --> Question
  ImgUpload --> PIL
  PIL --> CLIP
  Question --> GPT2
  CLIP & GPT2 --> Route1
  Route1 -- "YES" --> Spatial
  Route1 -- "NO" --> Base
  Spatial & Base -- "Beam search (width=5)" --> Decoder
  Decoder --> NeuralAns
  CLIP -- "Anchor similarity" --> Route2
  Route2 -- "YES" --> ObjDet
  ObjDet -- "Detected objects" --> Wikidata
  Wikidata -- "Structured facts" --> GroqV
  GroqV --> KGAns
  FastAPI -- "Narration request" --> GroqA
  GroqA --> Audio
  NeuralAns & KGAns & Audio -- "JSON output" --> FastAPI
  FastAPI --> UserApp
  </div>
</body>
</html>