// Chunk a document's raw_text and embed via Lovable AI Gateway (OpenAI text-embedding-3-small, 1536d). import { createClient } from "https://esm.sh/@supabase/supabase-js@2.57.4"; const corsHeaders = { "Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type, x-supabase-client-platform, x-supabase-client-platform-version, x-supabase-client-runtime, x-supabase-client-runtime-version", }; const EMBED_DIMS = 1536; const TARGET_CHARS = 1400; // ~350 tokens const OVERLAP = 200; const BATCH_SIZE = 32; // FNV-1a 32-bit function fnv1a(str: string): number { let h = 0x811c9dc5; for (let i = 0; i < str.length; i++) { h ^= str.charCodeAt(i); h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0; } return h >>> 0; } function tokenize(text: string): string[] { return text .toLowerCase() .replace(/[^a-z0-9\s]/g, " ") .split(/\s+/) .filter((t) => t.length >= 2 && t.length <= 40); } // Deterministic local embedding: hashed unigrams + bigrams, L2-normalized, 1536d. // No external API needed — stable across embed_chunks and chat query embedding. export function embedLocal(text: string): number[] { const v = new Float64Array(EMBED_DIMS); const toks = tokenize(text); if (toks.length === 0) return Array.from(v); const add = (term: string) => { const h = fnv1a(term); const idx = h % EMBED_DIMS; const sign = (h >>> 31) & 1 ? -1 : 1; v[idx] += sign; }; for (let i = 0; i < toks.length; i++) { add(toks[i]); if (i + 1 < toks.length) add(toks[i] + "_" + toks[i + 1]); } let norm = 0; for (let i = 0; i < EMBED_DIMS; i++) norm += v[i] * v[i]; norm = Math.sqrt(norm) || 1; const out = new Array(EMBED_DIMS); for (let i = 0; i < EMBED_DIMS; i++) out[i] = v[i] / norm; return out; } function chunkText(raw: string | null | undefined): string[] { const text = (raw ?? "").replace(/\s+/g, " ").trim(); if (!text) return []; const out: string[] = []; let i = 0; while (i < text.length) { const end = Math.min(text.length, i + TARGET_CHARS); out.push(text.slice(i, end).trim()); if (end >= text.length) break; i = end - OVERLAP; if (i < 0) i = 0; } return out.filter((c) => c.length > 0); } Deno.serve(async (req) => { if (req.method === "OPTIONS") return new Response("ok", { headers: corsHeaders }); const authHeader = req.headers.get("Authorization"); if (!authHeader?.startsWith("Bearer ")) { return new Response(JSON.stringify({ error: "Unauthorized" }), { status: 401, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } const SUPABASE_URL = Deno.env.get("SUPABASE_URL")!; const ANON = Deno.env.get("SUPABASE_ANON_KEY")!; const SERVICE = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!; // Note: embeddings are computed locally (deterministic hashed n-grams), so no AI key required. const userClient = createClient(SUPABASE_URL, ANON, { global: { headers: { Authorization: authHeader } }, }); const admin = createClient(SUPABASE_URL, SERVICE); const token = authHeader.replace("Bearer ", ""); const { data: { user }, error: userErr } = await userClient.auth.getUser(token); if (userErr || !user) { const msg = userErr ? userErr.message : "User not found"; return new Response(JSON.stringify({ error: "Unauthorized", details: msg }), { status: 401, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } const userId = user.id; let body: { document_id?: string }; try { body = await req.json(); } catch { return new Response(JSON.stringify({ error: "Invalid JSON" }), { status: 400, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } const documentId = body.document_id; if (!documentId) { return new Response(JSON.stringify({ error: "document_id required" }), { status: 400, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } const { data: doc } = await admin .from("documents") .select("id,user_id,raw_text,status") .eq("id", documentId) .maybeSingle(); if (!doc || doc.user_id !== userId) { return new Response(JSON.stringify({ error: "Document not found" }), { status: 404, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } if (!doc.raw_text || doc.raw_text.trim().length < 40) { return new Response(JSON.stringify({ error: "Document has no text yet" }), { status: 400, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } // Skip if already embedded. const { count: existing } = await admin .from("document_chunks") .select("id", { count: "exact", head: true }) .eq("document_id", documentId); if ((existing ?? 0) > 0) { return new Response(JSON.stringify({ ok: true, chunks: existing, cached: true }), { headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } const { data: job } = await admin .from("jobs") .insert({ user_id: userId, document_id: documentId, kind: "embed_chunks", status: "running", progress: 5 }) .select("id") .single(); const jobId = job?.id; try { const chunks = chunkText(doc.raw_text); if (chunks.length === 0) throw new Error("no_chunks"); let inserted = 0; for (let i = 0; i < chunks.length; i += BATCH_SIZE) { const batch = chunks.slice(i, i + BATCH_SIZE); const vectors = batch.map((c: string) => embedLocal(c)); const rows = batch.map((chunk_text: string, j: number) => ({ user_id: userId, document_id: documentId, chunk_text, order_index: i + j, embedding: `[${vectors[j].join(",")}]`, })); const { error: insErr } = await admin.from("document_chunks").insert(rows); if (insErr) throw new Error("insert_failed:" + insErr.message); inserted += rows.length; if (jobId) { await admin.from("jobs").update({ progress: Math.min(95, Math.round((inserted / chunks.length) * 95)), }).eq("id", jobId); } } if (jobId) await admin.from("jobs").update({ status: "succeeded", progress: 100 }).eq("id", jobId); return new Response(JSON.stringify({ ok: true, chunks: inserted }), { headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } catch (e) { const msg = e instanceof Error ? e.message : String(e); console.error("embed_chunks error", msg); if (jobId) await admin.from("jobs").update({ status: "failed", error: msg.slice(0, 200), progress: 100 }).eq("id", jobId); if (msg.includes("429")) { return new Response(JSON.stringify({ error: "Rate limit exceeded" }), { status: 429, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } if (msg.includes("402")) { return new Response(JSON.stringify({ error: "Out of credits" }), { status: 402, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } return new Response(JSON.stringify({ error: msg }), { status: 500, headers: { ...corsHeaders, "Content-Type": "application/json" }, }); } });