File size: 6,368 Bytes
34367da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import * as cheerio from 'cheerio';
import { logger } from '../../utils/logger.js';
import { getNeo4jVectorStore } from '../../platform/vector/Neo4jVectorStoreAdapter.js';
import { getLlmService } from '../llm/llmService.js';

/**
 * A visual element (image, figure, chart or diagram) discovered in a source
 * document, together with the text used to describe and index it.
 */
export interface VisualAsset {
    /** Identifier of the asset; also used as the record id when the asset is stored. */
    id: string;
    /** Kind of visual element this asset represents. */
    type: 'image' | 'figure' | 'chart' | 'diagram';
    /** URL the asset was found at. */
    sourceUrl: string;
    /** Optional path to a local copy of the asset. */
    localPath?: string;
    /** Human-readable description (alt text, caption or generated text). */
    description: string;
    /** Free-form labels attached to the asset. */
    tags: Array<string>;
    /** Arbitrary extra data carried along with the asset. */
    metadata: Record<string, any>;
}

/**
 * Extracts visual assets (images, figures) from documents, enriches them with
 * LLM-generated descriptions/tags and stores them in the vector store.
 */
export class VisualAssetExtractor {
    /**
     * Substrings of an image `src` that mark it as decoration or tracking
     * rather than content. This is a keyword heuristic only; image dimensions
     * are not inspected.
     */
    private static readonly IGNORED_SRC_KEYWORDS: readonly string[] = ['icon', 'logo', 'pixel'];

    /**
     * Extract visual assets from HTML content.
     *
     * Every `<img>` whose `src` passes the keyword filter becomes an `image`
     * asset, and every `<figure>` containing an `<img>` becomes a `figure`
     * asset built from its first image. An image that is reported through its
     * enclosing `<figure>` is not reported a second time as a plain `image`.
     *
     * @param html    Raw HTML markup to scan.
     * @param baseUrl URL of the page; relative `src` values are resolved against it.
     * @returns Image assets in document order, followed by figure assets in
     *          document order. Elements whose URL cannot be resolved are skipped.
     */
    async extractFromHtml(html: string, baseUrl: string): Promise<VisualAsset[]> {
        const $ = cheerio.load(html);
        const assets: VisualAsset[] = [];

        // Collect the <img> nodes that the figure pass below will report, so the
        // image pass does not emit the same picture twice under two different ids.
        const figureImageNodes = new Set<unknown>();
        $('figure').each((_, el) => {
            const img = $(el).find('img').first();
            const node = img.get(0);
            if (node && img.attr('src')) {
                figureImageNodes.add(node);
            }
        });

        // Find images
        $('img').each((_, el) => {
            if (figureImageNodes.has(el)) return;

            const src = $(el).attr('src');
            if (!src || this.isDecorative(src)) return;

            const absoluteUrl = this.resolveUrl(src, baseUrl);
            if (absoluteUrl === undefined) return;

            const alt = $(el).attr('alt') || '';
            assets.push({
                id: this.createId('img'),
                type: 'image',
                sourceUrl: absoluteUrl,
                description: alt,
                tags: ['web-image'], // Initial tag
                metadata: {
                    originalAlt: alt,
                    pageUrl: baseUrl
                }
            });
        });

        // Find figures (often charts/diagrams)
        $('figure').each((_, el) => {
            const img = $(el).find('img').first();
            const src = img.attr('src');
            if (!src) return;

            const absoluteUrl = this.resolveUrl(src, baseUrl);
            if (absoluteUrl === undefined) return;

            const caption = $(el).find('figcaption').text().trim();
            assets.push({
                id: this.createId('fig'),
                type: 'figure',
                sourceUrl: absoluteUrl,
                description: caption || img.attr('alt') || 'Figure',
                tags: ['figure', 'chart'],
                metadata: {
                    caption,
                    pageUrl: baseUrl
                }
            });
        });

        return assets;
    }

    /**
     * Placeholder for PDF extraction.
     *
     * @param filePath Path of the PDF file; currently only logged.
     * @returns Always an empty array until extraction is implemented.
     */
    async extractFromPdf(filePath: string): Promise<VisualAsset[]> {
        // TODO: Implement PDF image extraction using 'pdf-lib' or similar
        logger.info(`Would extract images from PDF: ${filePath}`);
        return [];
    }

    /**
     * Placeholder for PPTX extraction.
     *
     * @param filePath Path of the PPTX file; currently only logged.
     * @returns Always an empty array until extraction is implemented.
     */
    async extractFromPptx(filePath: string): Promise<VisualAsset[]> {
        // TODO: Implement PPTX image extraction using 'jszip' (unzipping pptx structure)
        logger.info(`Would extract images from PPTX: ${filePath}`);
        return [];
    }

    /**
     * Analyze and label assets using the LLM service.
     *
     * Assets whose description is longer than 50 characters are left alone.
     * For the others the LLM is asked for a description and tags based on the
     * asset's existing metadata (no image data is sent). A valid answer
     * replaces the description and is merged into the tags without duplicates.
     * Assets are processed one at a time, and a failure on one asset is logged
     * and does not stop the remaining ones.
     *
     * @param assets Assets to enrich; they are mutated in place.
     * @returns The same array instance that was passed in.
     */
    async analyzeAndLabel(assets: VisualAsset[]): Promise<VisualAsset[]> {
        const llm = getLlmService();

        for (const asset of assets) {
            try {
                // Skip if description is already detailed
                if (asset.description.length > 50) continue;

                // In a real scenario, we would pass the image URL or binary to a Vision model
                // For now, we simulate based on existing metadata/alt text
                const prompt = `Analyze this image metadata and generate 3 relevant tags and a short description.

                Type: ${asset.type}

                Source: ${asset.sourceUrl}

                Current Description: ${asset.description}

                Metadata: ${JSON.stringify(asset.metadata)}`;

                const analysis = await llm.generateContextualResponse(
                    "You are an expert Image Analyst. Output JSON with { description: string, tags: string[] }.",
                    prompt
                );

                const parsed = this.parseAnalysis(analysis);
                if (parsed === undefined) {
                    logger.warn(`Could not parse analysis for asset ${asset.id}; keeping existing description and tags`);
                    continue;
                }

                asset.description = parsed.description ?? asset.description;
                asset.tags = [...new Set([...asset.tags, ...parsed.tags])];
            } catch (error) {
                logger.warn(`Failed to analyze asset ${asset.id}`, error);
            }
        }

        return assets;
    }

    /**
     * Store assets in the vector store for retrieval.
     *
     * Each asset is written to the `assets` namespace with its description,
     * tags and type as content. The asset's own metadata is kept, but its
     * `type`, `sourceUrl` and `assetType` keys are overwritten by the values
     * set here. Nothing is written when `assets` is empty.
     *
     * @param assets Assets to persist.
     */
    async storeAssets(assets: VisualAsset[]): Promise<void> {
        // Avoid a pointless round trip to the store for an empty batch.
        if (assets.length === 0) {
            return;
        }

        const vectorStore = getNeo4jVectorStore();

        await vectorStore.batchUpsert({
            records: assets.map(asset => ({
                id: asset.id,
                content: `${asset.description}\nTags: ${asset.tags.join(', ')}\nType: ${asset.type}`,
                metadata: {
                    ...asset.metadata,
                    type: 'visual_asset',
                    sourceUrl: asset.sourceUrl,
                    assetType: asset.type
                },
                namespace: 'assets' // Separate namespace for visuals
            })),
            namespace: 'assets'
        });

        logger.info(`🖼️ Stored ${assets.length} visual assets`);
    }

    /** True when `src` contains one of the keywords used to filter out icons, logos and tracking pixels. */
    private isDecorative(src: string): boolean {
        return VisualAssetExtractor.IGNORED_SRC_KEYWORDS.some(keyword => src.includes(keyword));
    }

    /** Resolve `src` against `baseUrl`; returns `undefined` when the pair does not form a valid URL. */
    private resolveUrl(src: string, baseUrl: string): string | undefined {
        try {
            return new URL(src, baseUrl).toString();
        } catch {
            // Malformed URLs are expected in scraped markup; the caller skips the element.
            return undefined;
        }
    }

    /** Build an asset id of the form `<prefix>-<timestamp>-<random base-36 suffix>`. */
    private createId(prefix: 'img' | 'fig'): string {
        return `${prefix}-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`;
    }

    /**
     * Pull the first-`{`-to-last-`}` span out of an LLM reply and validate it.
     * Returns `undefined` when no JSON object can be parsed. Otherwise returns
     * the description (only if it is a non-empty string) and the tags (only
     * the non-empty string entries of an array; anything else yields no tags).
     */
    private parseAnalysis(raw: string): { description: string | undefined; tags: string[] } | undefined {
        // LLM output might be messy, so look for an embedded JSON object.
        const jsonMatch = raw.match(/\{[\s\S]*\}/);
        if (!jsonMatch) return undefined;

        let parsed: unknown;
        try {
            parsed = JSON.parse(jsonMatch[0]);
        } catch {
            return undefined;
        }
        if (typeof parsed !== 'object' || parsed === null) return undefined;

        const { description, tags } = parsed as { description?: unknown; tags?: unknown };
        return {
            description: typeof description === 'string' && description !== '' ? description : undefined,
            tags: Array.isArray(tags)
                ? tags.filter((tag): tag is string => typeof tag === 'string' && tag !== '')
                : []
        };
    }
}

/** Shared module-level instance of {@link VisualAssetExtractor}. */
export const visualAssetExtractor: VisualAssetExtractor = new VisualAssetExtractor();