// Source: Hugging Face Space "Kraft102/widgettdc-api" (status: Paused) — file size: 6,368 bytes, revision 34367da

import * as cheerio from 'cheerio';
import { logger } from '../../utils/logger.js';
import { getNeo4jVectorStore } from '../../platform/vector/Neo4jVectorStoreAdapter.js';
import { getLlmService } from '../llm/llmService.js';

/**
 * A visual element (image, figure, chart or diagram) discovered in a source
 * document, together with the text used to describe and index it.
 */
export interface VisualAsset {
    /** Identifier for the asset; the HTML extractor generates `img-…` / `fig-…` values. */
    id: string;
    /** Kind of visual; the HTML extractor only emits 'image' and 'figure'. */
    type: 'image' | 'figure' | 'chart' | 'diagram';
    /** URL of the visual; the HTML extractor resolves it against the page URL. */
    sourceUrl: string;
    /** Optional local file path; not set by any code in this file — presumably filled in by a downloader, TODO confirm. */
    localPath?: string;
    /** Human-readable description (alt text or figure caption, possibly replaced by LLM analysis). */
    description: string;
    /** Labels attached to the asset; joined into the indexed content when stored. */
    tags: string[];
    /** Free-form extra data (e.g. `pageUrl`, `originalAlt`, `caption`); spread into the stored record's metadata. */
    metadata: Record<string, any>;
}

/**
 * Finds visual assets (images and figures) in documents, enriches their
 * descriptions/tags through the LLM service, and stores them in the vector
 * store under the `assets` namespace.
 */
export class VisualAssetExtractor {

    /**
     * Substrings that mark an image URL as decoration or tracking rather than content.
     * Keyword matching is a stand-in for checking real image dimensions.
     */
    private static readonly IGNORED_SRC_KEYWORDS: readonly string[] = ['icon', 'logo', 'pixel'];

    /**
     * Extract visual assets from HTML content.
     *
     * Two passes run over the document: every `<img>` becomes an `image`
     * asset (unless its `src` contains an ignored keyword), and the first
     * `<img>` of every `<figure>` becomes a `figure` asset described by the
     * figure's caption. The lead image of a figure is reported only once, as
     * a `figure`.
     *
     * @param html - Raw HTML markup to scan.
     * @param baseUrl - URL of the page; relative `src` values are resolved against it.
     * @returns Image assets in document order, followed by figure assets in document order.
     *          Elements without a `src`, or whose URL cannot be resolved, are skipped.
     */
    async extractFromHtml(html: string, baseUrl: string): Promise<VisualAsset[]> {
        const $ = cheerio.load(html);
        const assets: VisualAsset[] = [];

        // The lead image of each <figure> is emitted by the figure pass below.
        // Remember those nodes so the generic <img> pass does not emit the
        // same picture a second time as a plain 'image'.
        const figureLeadImages = new Set<unknown>();
        $('figure').each((_, el) => {
            const lead = $(el).find('img').first().get(0);
            if (lead) figureLeadImages.add(lead);
        });

        // Find images
        $('img').each((_, el) => {
            if (figureLeadImages.has(el)) return;

            const src = $(el).attr('src');
            if (!src || VisualAssetExtractor.IGNORED_SRC_KEYWORDS.some(keyword => src.includes(keyword))) {
                return;
            }

            const absoluteUrl = this.resolveUrl(src, baseUrl);
            if (!absoluteUrl) return; // unresolvable URL: best-effort skip

            const alt = $(el).attr('alt') || '';
            assets.push({
                id: this.makeId('img'),
                type: 'image',
                sourceUrl: absoluteUrl,
                description: alt,
                tags: ['web-image'], // Initial tag
                metadata: {
                    originalAlt: alt,
                    pageUrl: baseUrl
                }
            });
        });

        // Find figures (often charts/diagrams)
        $('figure').each((_, el) => {
            const img = $(el).find('img').first();
            const src = img.attr('src');
            if (!src) return;

            const absoluteUrl = this.resolveUrl(src, baseUrl);
            if (!absoluteUrl) return; // unresolvable URL: best-effort skip

            const caption = $(el).find('figcaption').text().trim();
            assets.push({
                id: this.makeId('fig'),
                type: 'figure',
                sourceUrl: absoluteUrl,
                description: caption || img.attr('alt') || 'Figure',
                tags: ['figure', 'chart'],
                metadata: {
                    caption,
                    pageUrl: baseUrl
                }
            });
        });

        return assets;
    }

    /**
     * Placeholder for PDF extraction.
     *
     * @param filePath - Path of the PDF file.
     * @returns Always an empty list; only logs what would be processed.
     */
    async extractFromPdf(filePath: string): Promise<VisualAsset[]> {
        // TODO: Implement PDF image extraction using 'pdf-lib' or similar
        logger.info(`Would extract images from PDF: ${filePath}`);
        return [];
    }

    /**
     * Placeholder for PPTX extraction.
     *
     * @param filePath - Path of the PPTX file.
     * @returns Always an empty list; only logs what would be processed.
     */
    async extractFromPptx(filePath: string): Promise<VisualAsset[]> {
        // TODO: Implement PPTX image extraction using 'jszip' (unzipping pptx structure)
        logger.info(`Would extract images from PPTX: ${filePath}`);
        return [];
    }

    /**
     * Analyze and label assets using the LLM service.
     *
     * Assets whose description is longer than 50 characters are left alone.
     * For the rest, the asset's metadata is sent to the LLM and the JSON in
     * its reply is merged in: a non-blank string `description` replaces the
     * current one, and string `tags` are appended without duplicates.
     * Replies that are not usable JSON leave the asset unchanged.
     *
     * @param assets - Assets to enrich; they are mutated in place.
     * @returns The same array that was passed in.
     */
    async analyzeAndLabel(assets: VisualAsset[]): Promise<VisualAsset[]> {
        const llm = getLlmService();

        for (const asset of assets) {
            try {
                // Skip if description is already detailed
                if (asset.description.length > 50) continue;

                // In a real scenario, we would pass the image URL or binary to a Vision model
                // For now, we simulate based on existing metadata/alt text

                const prompt = `Analyze this image metadata and generate 3 relevant tags and a short description.

                Type: ${asset.type}

                Source: ${asset.sourceUrl}

                Current Description: ${asset.description}

                Metadata: ${JSON.stringify(asset.metadata)}`;

                const analysis = await llm.generateContextualResponse(
                    "You are an expert Image Analyst. Output JSON with { description: string, tags: string[] }.",
                    prompt
                );

                const parsed = this.parseAnalysis(analysis);
                if (!parsed) {
                    logger.warn(`Could not parse analysis for asset ${asset.id}`);
                    continue;
                }

                if (parsed.description) asset.description = parsed.description;
                asset.tags = [...new Set([...asset.tags, ...parsed.tags])];
            } catch (error) {
                // One failing asset must not stop the rest of the batch.
                logger.warn(`Failed to analyze asset ${asset.id}`, error);
            }
        }

        return assets;
    }

    /**
     * Store assets in the vector store for retrieval.
     *
     * Each asset is indexed by its description, tags and type; its metadata is
     * stored alongside `type: 'visual_asset'`, `sourceUrl` and `assetType`.
     *
     * @param assets - Assets to upsert. An empty list is a no-op.
     */
    async storeAssets(assets: VisualAsset[]): Promise<void> {
        // Nothing to write: skip opening the store and sending an empty batch.
        if (assets.length === 0) return;

        const vectorStore = getNeo4jVectorStore();

        await vectorStore.batchUpsert({
            records: assets.map(asset => ({
                id: asset.id,
                content: `${asset.description}\nTags: ${asset.tags.join(', ')}\nType: ${asset.type}`,
                metadata: {
                    ...asset.metadata,
                    type: 'visual_asset',
                    sourceUrl: asset.sourceUrl,
                    assetType: asset.type
                },
                namespace: 'assets' // Separate namespace for visuals
            })),
            namespace: 'assets'
        });

        logger.info(`🖼️ Stored ${assets.length} visual assets`);
    }

    /** Builds an asset id of the form `<prefix>-<epoch ms>-<up to 5 base-36 chars>`. */
    private makeId(prefix: 'img' | 'fig'): string {
        return `${prefix}-${Date.now()}-${Math.random().toString(36).slice(2, 7)}`;
    }

    /** Resolves `src` against `baseUrl`; returns `undefined` when no valid URL can be formed. */
    private resolveUrl(src: string, baseUrl: string): string | undefined {
        try {
            return new URL(src, baseUrl).toString();
        } catch {
            return undefined;
        }
    }

    /**
     * Pulls the `{ description, tags }` object out of a raw LLM reply.
     * Returns `undefined` when the reply holds no parsable JSON object; fields
     * of the wrong type are dropped rather than trusted.
     */
    private parseAnalysis(raw: unknown): { description?: string; tags: string[] } | undefined {
        if (typeof raw !== 'string') return undefined;

        // The reply may wrap the JSON in prose, so take the outermost {...} span.
        const jsonText = raw.match(/\{[\s\S]*\}/)?.[0];
        if (!jsonText) return undefined;

        let candidate: unknown;
        try {
            candidate = JSON.parse(jsonText);
        } catch {
            return undefined;
        }
        if (typeof candidate !== 'object' || candidate === null) return undefined;

        const { description, tags } = candidate as { description?: unknown; tags?: unknown };
        return {
            description: typeof description === 'string' && description.trim() !== '' ? description : undefined,
            // A bare string here would otherwise be spread into single characters.
            tags: Array.isArray(tags)
                ? tags
                    .filter((tag): tag is string => typeof tag === 'string')
                    .map(tag => tag.trim())
                    .filter(tag => tag !== '')
                : []
        };
    }
}

/** Module-level `VisualAssetExtractor` instance, created when this module is loaded and exported for callers to share. */
export const visualAssetExtractor = new VisualAssetExtractor();