// Page header captured from the hosting site (not part of the module):
// Space status "Paused"; file size 6,368 bytes.
import * as cheerio from 'cheerio';
import { logger } from '../../utils/logger.js';
import { getNeo4jVectorStore } from '../../platform/vector/Neo4jVectorStoreAdapter.js';
import { getLlmService } from '../llm/llmService.js';
/**
 * A single visual element (image, figure, chart or diagram) discovered in a
 * source document, together with the text used to describe and index it.
 */
export interface VisualAsset {
  /** Identifier of the asset; also used as the record id when the asset is stored. */
  id: string;

  /** Coarse category of the visual. */
  type: 'image' | 'figure' | 'chart' | 'diagram';

  /** URL the visual can be fetched from. */
  sourceUrl: string;

  /** Optional path of a local copy of the visual. */
  localPath?: string;

  /** Human-readable description (alt text, caption, or generated text). */
  description: string;

  /** Free-form labels attached to the asset. */
  tags: Array<string>;

  /** Arbitrary extra data about the asset (e.g. originating page URL, caption). */
  metadata: { [key: string]: any };
}
/**
 * Finds visual assets in source documents, enriches their descriptions/tags
 * through the LLM service, and persists them in the vector store.
 */
export class VisualAssetExtractor {
  /** Vector-store namespace that keeps visual assets apart from other records. */
  private static readonly ASSET_NAMESPACE = 'assets';

  /** Substrings of an `<img>` src that mark it as decoration or tracking, not content. */
  private static readonly IGNORED_SRC_KEYWORDS: readonly string[] = ['icon', 'logo', 'pixel'];

  /** Descriptions longer than this are treated as detailed enough to skip analysis. */
  private static readonly DETAILED_DESCRIPTION_LENGTH = 50;

  /**
   * Extract visual assets from HTML content.
   *
   * Every `<figure>` that contains an `<img>` yields one `figure` asset (using
   * the first image inside it). Every other `<img>` yields one `image` asset,
   * unless its `src` contains one of the ignored keywords. An image that is
   * already reported as a figure is not reported a second time as a plain
   * image, so the same URL is not analysed and stored twice.
   *
   * @param html - HTML markup to scan.
   * @param baseUrl - URL of the page; used to resolve relative `src` values and
   *   recorded as `metadata.pageUrl`.
   * @returns Plain images first, then figures, each in document order.
   */
  async extractFromHtml(html: string, baseUrl: string): Promise<VisualAsset[]> {
    const $ = cheerio.load(html);

    // Figures are collected first so the <img> pass can skip their images.
    const figures: VisualAsset[] = [];
    const figureUrls = new Set<string>();
    $('figure').each((_, el) => {
      const img = $(el).find('img').first();
      const src = img.attr('src');
      const absoluteUrl = src ? VisualAssetExtractor.resolveUrl(src, baseUrl) : undefined;
      if (!absoluteUrl) return;

      const caption = $(el).find('figcaption').text().trim();
      figureUrls.add(absoluteUrl);
      figures.push({
        id: VisualAssetExtractor.makeId('fig'),
        type: 'figure',
        sourceUrl: absoluteUrl,
        // `||` is intentional: an empty caption or alt must fall through.
        description: caption || img.attr('alt') || 'Figure',
        tags: ['figure', 'chart'],
        metadata: {
          caption,
          pageUrl: baseUrl
        }
      });
    });

    const images: VisualAsset[] = [];
    $('img').each((_, el) => {
      const src = $(el).attr('src');
      // Keyword filter stands in for a real size check on icons/tracking pixels.
      // In production, check image dimensions.
      if (!src || VisualAssetExtractor.IGNORED_SRC_KEYWORDS.some(keyword => src.includes(keyword))) {
        return;
      }

      const absoluteUrl = VisualAssetExtractor.resolveUrl(src, baseUrl);
      if (!absoluteUrl || figureUrls.has(absoluteUrl)) return;

      const alt = $(el).attr('alt') ?? '';
      images.push({
        id: VisualAssetExtractor.makeId('img'),
        type: 'image',
        sourceUrl: absoluteUrl,
        description: alt,
        tags: ['web-image'], // Initial tag
        metadata: {
          originalAlt: alt,
          pageUrl: baseUrl
        }
      });
    });

    return [...images, ...figures];
  }

  /**
   * Placeholder for PDF extraction.
   *
   * @param filePath - Path of the PDF file; currently only logged.
   * @returns Always an empty array until extraction is implemented.
   */
  async extractFromPdf(filePath: string): Promise<VisualAsset[]> {
    // TODO: Implement PDF image extraction using 'pdf-lib' or similar
    logger.info(`Would extract images from PDF: ${filePath}`);
    return [];
  }

  /**
   * Placeholder for PPTX extraction.
   *
   * @param filePath - Path of the PPTX file; currently only logged.
   * @returns Always an empty array until extraction is implemented.
   */
  async extractFromPptx(filePath: string): Promise<VisualAsset[]> {
    // TODO: Implement PPTX image extraction using 'jszip' (unzipping pptx structure)
    logger.info(`Would extract images from PPTX: ${filePath}`);
    return [];
  }

  /**
   * Analyze and label assets using the LLM service.
   *
   * Assets whose description is already longer than 50 characters are left
   * untouched. For the others, the LLM is asked for a JSON object with a
   * description and tags; valid string values are merged into the asset
   * (tags are de-duplicated). A failure on one asset is logged and does not
   * stop the remaining assets from being processed.
   *
   * @param assets - Assets to enrich; they are updated in place.
   * @returns The same array instance that was passed in.
   */
  async analyzeAndLabel(assets: VisualAsset[]): Promise<VisualAsset[]> {
    const llm = getLlmService();

    // Processed one at a time, as before; requests are not issued in parallel.
    for (const asset of assets) {
      try {
        // Skip if description is already detailed
        if (asset.description.length > VisualAssetExtractor.DETAILED_DESCRIPTION_LENGTH) continue;

        // In a real scenario, we would pass the image URL or binary to a Vision model.
        // For now, we simulate based on existing metadata/alt text.
        const prompt = [
          'Analyze this image metadata and generate 3 relevant tags and a short description.',
          `Type: ${asset.type}`,
          `Source: ${asset.sourceUrl}`,
          `Current Description: ${asset.description}`,
          `Metadata: ${JSON.stringify(asset.metadata)}`
        ].join('\n');

        const analysis = await llm.generateContextualResponse(
          "You are an expert Image Analyst. Output JSON with { description: string, tags: string[] }.",
          prompt
        );

        const parsed = VisualAssetExtractor.parseAnalysis(analysis);
        if (!parsed) {
          // LLM output might be messy; keep the asset as it was.
          logger.warn(`No usable JSON in analysis for asset ${asset.id}`);
          continue;
        }

        if (parsed.description !== undefined) {
          asset.description = parsed.description;
        }
        asset.tags = [...new Set([...asset.tags, ...parsed.tags])];
      } catch (error) {
        logger.warn(`Failed to analyze asset ${asset.id}`, error);
      }
    }

    return assets;
  }

  /**
   * Store assets in the vector DB for retrieval.
   *
   * Each asset becomes one record whose content combines description, tags
   * and type. Nothing is sent to the store when `assets` is empty.
   *
   * @param assets - Assets to persist.
   */
  async storeAssets(assets: VisualAsset[]): Promise<void> {
    if (assets.length === 0) return;

    const namespace = VisualAssetExtractor.ASSET_NAMESPACE; // Separate namespace for visuals
    const vectorStore = getNeo4jVectorStore();

    await vectorStore.batchUpsert({
      records: assets.map(asset => ({
        id: asset.id,
        content: `${asset.description}\nTags: ${asset.tags.join(', ')}\nType: ${asset.type}`,
        metadata: {
          ...asset.metadata,
          type: 'visual_asset',
          sourceUrl: asset.sourceUrl,
          assetType: asset.type
        },
        namespace
      })),
      namespace
    });

    logger.info(`🖼️ Stored ${assets.length} visual assets`);
  }

  /** Build an asset id of the form `<prefix>-<timestamp>-<random suffix>`. */
  private static makeId(prefix: 'img' | 'fig'): string {
    const suffix = Math.random().toString(36).slice(2, 7);
    return `${prefix}-${Date.now()}-${suffix}`;
  }

  /** Resolve `src` against `baseUrl`; returns `undefined` when no valid URL can be formed. */
  private static resolveUrl(src: string, baseUrl: string): string | undefined {
    try {
      return new URL(src, baseUrl).toString();
    } catch {
      // Invalid URL: the element is skipped by the caller.
      return undefined;
    }
  }

  /**
   * Pull the `{ description, tags }` object out of a raw LLM reply.
   *
   * Only a non-blank string description and non-blank string tags are kept,
   * so malformed values can never be written onto an asset.
   *
   * @returns The validated fields, or `undefined` when the reply contains no
   *   parseable JSON object.
   */
  private static parseAnalysis(raw: unknown): { description?: string; tags: string[] } | undefined {
    if (typeof raw !== 'string') return undefined;

    // Take everything from the first "{" to the last "}" to tolerate surrounding prose.
    const jsonMatch = raw.match(/\{[\s\S]*\}/);
    if (!jsonMatch) return undefined;

    let parsed: unknown;
    try {
      parsed = JSON.parse(jsonMatch[0]);
    } catch {
      return undefined;
    }
    if (typeof parsed !== 'object' || parsed === null) return undefined;

    const { description, tags } = parsed as Record<string, unknown>;
    const cleanTags = Array.isArray(tags)
      ? tags
          .filter((tag): tag is string => typeof tag === 'string')
          .map(tag => tag.trim())
          .filter(tag => tag !== '')
      : [];

    return {
      description: typeof description === 'string' && description.trim() !== '' ? description : undefined,
      tags: cleanTags
    };
  }
}
/** Module-level instance of the extractor, created once when the module is loaded. */
export const visualAssetExtractor: VisualAssetExtractor = new VisualAssetExtractor();
|