|
|
import { readdir, readFile, stat } from 'fs/promises'; |
|
|
import { join } from 'path'; |
|
|
import { fileURLToPath } from 'url'; |
|
|
import { dirname } from 'path'; |
|
|
import { createClient } from '@supabase/supabase-js'; |
|
|
import { GoogleGenerativeAI } from '@google/generative-ai'; |
|
|
import { config, validateConfig } from '../utils/config.js'; |
|
|
import { logger } from '../utils/logger.js'; |
|
|
|
|
|
// ES modules expose no __dirname, so derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Credentials are read once from the shared config object.
const { url: SUPABASE_URL, serviceKey: SUPABASE_SERVICE_KEY } = config.supabase;
const { apiKey: GEMINI_API_KEY } = config.gemini;

// Name of the specialty folder that gets seeded, and the display label derived from it.
const SPECIALTY_FOLDER = 'Da liễu';
const SPECIALTY_LABEL = slugToLabel(SPECIALTY_FOLDER);

// Supabase client created with the service key; auth session persistence is turned off.
const supabaseOptions = { auth: { persistSession: false } };
const supabase = createClient(SUPABASE_URL, SUPABASE_SERVICE_KEY, supabaseOptions);

// Gemini client plus the embedding model used for every chunk.
const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);
const embedModel = genAI.getGenerativeModel({ model: 'text-embedding-004' });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Converts a folder or file slug into a human-readable label.
 *
 * Processing order:
 *  1. Runs of `-` / `_` become a single space; whitespace is collapsed and trimmed.
 *  2. A leading "CHƯƠNG <n>" / "CHUƠNG <n>" chapter prefix (any letter case) is removed.
 *  3. A leading ordinal is removed: digits and/or UPPERCASE Roman-numeral letters,
 *     optionally parenthesised and optionally followed by `.`, then whitespace.
 *  4. Spacing around `:`, `.` and `,` is normalised and repeated spaces collapsed.
 *
 * The ordinal match is deliberately case-sensitive. Matching it case-insensitively
 * strips ordinary leading words that happen to consist only of the letters
 * I, V, X, L, C, D, M (for example "Vi nấm da" would lose its first word).
 *
 * @param slug Raw directory or file name component.
 * @returns The cleaned label; if cleaning removes everything, the slug with only
 *          its `-` / `_` runs replaced by spaces.
 */
function slugToLabel(slug: string): string {
  const spaced = slug.replace(/[-_]+/g, ' ');

  // Hyphens were all turned into spaces above, so no dash handling is needed below.
  const label = spaced
    .replace(/\s+/g, ' ')
    .trim()
    // Chapter prefix such as "CHƯƠNG 3." or "chương 3:".
    .replace(/^(CH(U|Ư)ƠNG)\s*\d+\s*[.:~-]?\s*/iu, '')
    // Leading ordinal such as "1.", "(2)", "IV" — uppercase Roman numerals only.
    .replace(/^(?:\(?[0-9IVXLCDM]+\)?)(?:\s*\.)?\s+/u, '')
    .replace(/\s*:\s*/g, ': ')
    .replace(/\s+\./g, '. ')
    .replace(/,\s*/g, ', ')
    .replace(/\s{2,}/g, ' ')
    .trim();

  // If the slug consisted solely of a prefix/ordinal, fall back to the spaced slug.
  return label || spaced.trim();
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Derives a section title from a section file name.
 *
 * Only names ending in `.txt` are accepted. The extension and a trailing
 * `_<digits>` suffix are removed, each remaining run of underscores becomes a
 * single space, and the result is trimmed.
 *
 * @param filename File name (not a full path).
 * @returns `{ title }`, or `null` when the name does not end in `.txt` or no
 *          title text is left.
 */
function parseSectionFileName(filename: string): { title: string } | null {
  const extension = '.txt';
  if (!filename.endsWith(extension)) {
    return null;
  }

  const stem = filename.slice(0, filename.length - extension.length);
  const words = stem.replace(/_(\d+)$/, '').split(/_+/);
  const title = words.join(' ').trim();

  return title ? { title } : null;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Requests an embedding for `text` from the module-level embedding model.
 *
 * @param text Content to embed.
 * @returns The `values` array of the embedding in the model response.
 */
async function embed(text: string): Promise<number[]> {
  const { embedding } = await embedModel.embedContent(text);
  return embedding.values;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reports whether `path` refers to a directory.
 *
 * @param path File-system path to inspect.
 * @returns `true` when `stat` succeeds and reports a directory; `false`
 *          otherwise, including when `stat` fails for any reason.
 */
async function isDirectory(path: string): Promise<boolean> {
  let directory = false;
  try {
    directory = (await stat(path)).isDirectory();
  } catch {
    // A failing stat (e.g. missing entry) is reported as "not a directory".
  }
  return directory;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the id of the `specialties` row named `name`, inserting the row
 * when none exists.
 *
 * The lookup uses `limit(1).maybeSingle()` rather than `single()`: with
 * `single()` both "no rows" and "several rows" surface as error PGRST116, so
 * pre-existing duplicates were mistaken for "not found" and yet another row
 * was inserted. Here a missing row is simply `data === null`, duplicates
 * resolve to one existing row, and any error is a genuine failure.
 *
 * @param name Specialty name to look up or create.
 * @returns The row id, or `null` when the lookup or the insert failed (the
 *          failure is logged).
 */
async function getOrCreateSpecialty(name: string): Promise<string | null> {
  const { data: existing, error: fetchError } = await supabase
    .from('specialties')
    .select('id')
    .eq('name', name)
    .limit(1)
    .maybeSingle();

  if (fetchError) {
    logger.error({ error: fetchError }, `Error fetching specialty: ${name}`);
    return null;
  }

  if (existing) {
    return existing.id;
  }

  // Not found: create the row and read its generated id back.
  const { data: created, error: createError } = await supabase
    .from('specialties')
    .insert({ name })
    .select('id')
    .single();

  if (createError) {
    logger.error({ error: createError }, `Error creating specialty: ${name}`);
    return null;
  }

  return created?.id ?? null;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Returns the id of the `diseases` row matching `name` within `specialtyId`,
 * inserting the row when none exists.
 *
 * The lookup uses `limit(1).maybeSingle()` rather than `single()`: with
 * `single()` both "no rows" and "several rows" surface as error PGRST116, so
 * pre-existing duplicates were mistaken for "not found" and yet another row
 * was inserted. Here a missing row is simply `data === null`, duplicates
 * resolve to one existing row, and any error is a genuine failure.
 *
 * @param name Disease name to look up or create.
 * @param specialtyId Id of the specialty the disease belongs to.
 * @returns The row id, or `null` when the lookup or the insert failed (the
 *          failure is logged).
 */
async function getOrCreateDisease(
  name: string,
  specialtyId: string
): Promise<string | null> {
  const { data: existing, error: fetchError } = await supabase
    .from('diseases')
    .select('id')
    .eq('name', name)
    .eq('specialty_id', specialtyId)
    .limit(1)
    .maybeSingle();

  if (fetchError) {
    logger.error({ error: fetchError }, `Error fetching disease: ${name}`);
    return null;
  }

  if (existing) {
    return existing.id;
  }

  // Not found: create the row and read its generated id back.
  const { data: created, error: createError } = await supabase
    .from('diseases')
    .insert({ name, specialty_id: specialtyId })
    .select('id')
    .single();

  if (createError) {
    logger.error({ error: createError }, `Error creating disease: ${name}`);
    return null;
  }

  return created?.id ?? null;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Resolves the id of the `info_domains` row that corresponds to `name`.
 *
 * An exact name match is tried first. If that yields nothing (or fails), all
 * domains are fetched and the first one whose lower-cased, trimmed name
 * contains — or is contained in — the lower-cased, trimmed `name` is used.
 *
 * Blank names on either side are never fuzzy-matched: `''` is a substring of
 * every string, so a blank domain name (or blank `name`) would otherwise
 * match arbitrarily.
 *
 * @param name Section title to map to an info domain.
 * @returns The matching domain id, or `null` when nothing matches or the
 *          domain list cannot be read.
 */
async function getInfoDomainId(name: string): Promise<string | null> {
  const { data: exact, error: exactError } = await supabase
    .from('info_domains')
    .select('id')
    .eq('name', name)
    .limit(1)
    .maybeSingle();

  if (exact) {
    return exact.id ?? null;
  }

  if (exactError) {
    // Best effort: a failed exact lookup still falls through to the fuzzy match.
    logger.warn({ error: exactError }, `Exact info domain lookup failed: ${name}`);
  }

  const wanted = name.toLowerCase().trim();
  if (!wanted) {
    return null;
  }

  const { data: allDomains, error: listError } = await supabase
    .from('info_domains')
    .select('id, name');

  if (listError) {
    logger.warn({ error: listError }, `Could not list info domains while matching: ${name}`);
    return null;
  }

  for (const domain of allDomains ?? []) {
    const candidate =
      typeof domain.name === 'string' ? domain.name.toLowerCase().trim() : '';
    if (!candidate) {
      continue;
    }
    if (candidate.includes(wanted) || wanted.includes(candidate)) {
      return domain.id;
    }
  }

  return null;
}
|
|
|
|
|
/** Running totals reported when seeding finishes. */
interface SeedStats {
  seeded: number;
  skipped: number;
  duplicates: number;
}

/** Details of the disease folder a section file belongs to. */
interface SectionContext {
  specialtyId: string;
  chapterSlug: string;
  chapterLabel: string;
  diseaseSlug: string;
  diseaseLabel: string;
  diseasePath: string;
}

/**
 * Makes sure a `guidelines` row exists for (disease, section title) and that
 * it owns a `guideline_chunks` row holding `content`.
 *
 * Every failure here is logged as a warning and otherwise ignored; this
 * function does not touch the seeding totals.
 *
 * @param context Disease folder details (labels are stored on the rows).
 * @param sectionTitle Section title, stored as the guideline `source`.
 * @param content Trimmed file content.
 * @param getEmbedding Lazily yields the embedding of `content`; only called
 *                     when a chunk actually has to be inserted.
 */
async function ensureGuidelineChunk(
  context: SectionContext,
  sectionTitle: string,
  content: string,
  getEmbedding: () => Promise<number[]>
): Promise<void> {
  // limit(1).maybeSingle(): a missing row is not an error, and duplicate rows
  // resolve to one of them instead of looking like "not found".
  const { data: existingGuideline, error: guidelineLookupError } = await supabase
    .from('guidelines')
    .select('id')
    .eq('condition', context.diseaseLabel)
    .eq('source', sectionTitle)
    .limit(1)
    .maybeSingle();

  if (guidelineLookupError) {
    logger.warn(` ⚠️ Failed to look up guideline record: ${guidelineLookupError.message}`);
    return;
  }

  let guidelineId: string | number | undefined;
  if (existingGuideline) {
    guidelineId = existingGuideline.id;
  } else {
    const { data: newGuideline, error: gError } = await supabase
      .from('guidelines')
      .insert({
        condition: context.diseaseLabel,
        source: sectionTitle,
        updated_at: new Date().toISOString(),
      })
      .select('id')
      .single();

    if (gError || !newGuideline) {
      logger.warn(` ⚠️ Failed to create guideline record: ${gError?.message}`);
      return;
    }
    guidelineId = newGuideline.id;
  }

  if (!guidelineId) {
    return;
  }

  // Compare content client-side so the whole document is not sent as a URL
  // filter value, and so a failed lookup cannot be mistaken for "no such chunk".
  const { data: storedChunks, error: chunkLookupError } = await supabase
    .from('guideline_chunks')
    .select('content')
    .eq('guideline_id', guidelineId);

  if (chunkLookupError) {
    logger.warn(` ⚠️ Failed to look up guideline chunks: ${chunkLookupError.message}`);
    return;
  }

  const alreadyStored = (storedChunks ?? []).some((chunk) => chunk.content === content);
  if (alreadyStored) {
    return;
  }

  const embedding = await getEmbedding();
  const { error: chunkError } = await supabase.from('guideline_chunks').insert({
    guideline_id: guidelineId,
    content,
    embedding,
    metadata: {
      chapter: context.chapterLabel,
      specialty: SPECIALTY_LABEL,
    },
  });

  if (chunkError) {
    logger.warn(` ⚠️ Failed to insert guideline chunk: ${chunkError.message}`);
  }
}

/**
 * Seeds one section file into `medical_knowledge_chunks` and mirrors it into
 * `guidelines` / `guideline_chunks`, updating `stats` in place.
 *
 * Counting rules:
 *  - non-`.txt` files and `_raw.txt` are ignored without being counted;
 *  - unparsable names, empty files, failed lookups/inserts and thrown errors
 *    count as skipped;
 *  - a file whose `path` is already stored counts as a duplicate;
 *  - a newly inserted medical chunk counts as seeded once the guideline
 *    mirroring step has run.
 *
 * @param context Disease folder details.
 * @param filename Entry name inside the disease folder.
 * @param stats Totals to update.
 */
async function seedSectionFile(
  context: SectionContext,
  filename: string,
  stats: SeedStats
): Promise<void> {
  if (!filename.endsWith('.txt') || filename === '_raw.txt') {
    return;
  }

  const sectionInfo = parseSectionFileName(filename);
  if (!sectionInfo) {
    logger.warn(` ⚠️ Skipping invalid filename: ${filename}`);
    stats.skipped++;
    return;
  }

  const { title: sectionTitle } = sectionInfo;
  const sectionPath = join(context.diseasePath, filename);

  try {
    const contentTrimmed = (await readFile(sectionPath, 'utf-8')).trim();
    if (!contentTrimmed) {
      logger.warn(` ⚠️ Skipping empty file: ${filename}`);
      stats.skipped++;
      return;
    }

    // Stable, platform-independent key relative to the data root. Building it
    // from the path components (instead of stripping a "<dataRoot>/" prefix)
    // yields the same value on POSIX and avoids storing an absolute,
    // backslash-separated path on Windows.
    const relativePath = [
      SPECIALTY_FOLDER,
      context.chapterSlug,
      context.diseaseSlug,
      filename,
    ].join('/');

    logger.info(` 📄 ${sectionTitle} (${contentTrimmed.length} chars)`);

    const diseaseId = await getOrCreateDisease(context.diseaseLabel, context.specialtyId);
    const infoDomainId = await getInfoDomainId(sectionTitle);

    // The embedding is requested at most once per file, and only if some
    // insert actually needs it (already-seeded files cost no API call).
    let embeddingPromise: Promise<number[]> | undefined;
    const getEmbedding = (): Promise<number[]> => {
      if (!embeddingPromise) {
        embeddingPromise = embed(contentTrimmed);
      }
      return embeddingPromise;
    };

    const { data: existingMedicalChunk, error: lookupError } = await supabase
      .from('medical_knowledge_chunks')
      .select('id')
      .eq('path', relativePath)
      .limit(1)
      .maybeSingle();

    if (lookupError) {
      // Without a reliable answer an insert could create a duplicate; skip instead.
      logger.error(
        { error: lookupError.message },
        ` ❌ Error checking for existing chunk: ${sectionTitle}`
      );
      stats.skipped++;
      return;
    }

    const isNewMedicalChunk = !existingMedicalChunk;

    if (existingMedicalChunk) {
      logger.info(` ⏭️ Skipping duplicate medical chunk: ${sectionTitle} (already exists)`);
      stats.duplicates++;
    } else {
      const embedding = await getEmbedding();
      const { error: error1 } = await supabase.from('medical_knowledge_chunks').insert({
        specialty_id: context.specialtyId,
        disease_id: diseaseId,
        info_domain_id: infoDomainId,
        specialty: SPECIALTY_LABEL,
        chapter: context.chapterLabel,
        disease: context.diseaseLabel,
        section_title: sectionTitle,
        content: contentTrimmed,
        path: relativePath,
        embedding,
      });

      if (error1) {
        logger.error({ error: error1.message }, ` ❌ Error inserting structured chunk: ${sectionTitle}`);
        stats.skipped++;
        return;
      }
    }

    // Mirrored even for duplicates, so a missing guideline chunk gets backfilled.
    await ensureGuidelineChunk(context, sectionTitle, contentTrimmed, getEmbedding);

    if (isNewMedicalChunk) {
      stats.seeded++;
    }
  } catch (error) {
    logger.error({ error }, ` ❌ Error processing file: ${filename}`);
    stats.skipped++;
  }
}

/**
 * Entry point: walks `<data>/<SPECIALTY_FOLDER>/<chapter>/<disease>/*.txt`
 * and seeds every section file, then logs the totals.
 *
 * The process exits with code 1 when the specialty folder is missing, the
 * specialty record cannot be obtained, or an unexpected error escapes the
 * per-file handling. Per-file failures are logged and counted, not fatal.
 */
async function seedGuidelines(): Promise<void> {
  try {
    logger.info(`Starting medical knowledge seeding for ${SPECIALTY_LABEL}...`);
    logger.info(`⚠️ This will seed ALL diseases in the ${SPECIALTY_FOLDER} folder.`);

    validateConfig();

    const dataRoot = join(__dirname, '../../data');
    const specialtyRoot = join(dataRoot, SPECIALTY_FOLDER);

    logger.info(`Reading data from: ${specialtyRoot}`);

    try {
      await stat(specialtyRoot);
    } catch {
      logger.error(`Specialty folder not found: ${specialtyRoot}`);
      process.exit(1);
    }

    const specialtyId = await getOrCreateSpecialty(SPECIALTY_LABEL);
    if (!specialtyId) {
      logger.error('Failed to get or create specialty record');
      process.exit(1);
    }
    logger.info(`Specialty ID: ${specialtyId}`);

    const stats: SeedStats = { seeded: 0, skipped: 0, duplicates: 0 };

    for (const chapterSlug of await readdir(specialtyRoot)) {
      const chapterPath = join(specialtyRoot, chapterSlug);
      if (!(await isDirectory(chapterPath))) {
        continue;
      }

      const chapterLabel = slugToLabel(chapterSlug);
      logger.info(`\n📖 Processing chapter: ${chapterLabel}`);

      for (const diseaseSlug of await readdir(chapterPath)) {
        const diseasePath = join(chapterPath, diseaseSlug);
        if (!(await isDirectory(diseasePath))) {
          continue;
        }

        const diseaseLabel = slugToLabel(diseaseSlug);
        logger.info(` 🩺 Processing disease: ${diseaseLabel}`);

        const context: SectionContext = {
          specialtyId,
          chapterSlug,
          chapterLabel,
          diseaseSlug,
          diseaseLabel,
          diseasePath,
        };

        // Files are processed one at a time, in directory-listing order.
        for (const filename of await readdir(diseasePath)) {
          await seedSectionFile(context, filename, stats);
        }
      }
    }

    logger.info(`\n✅ Seeding completed!`);
    logger.info(` 📊 Total seeded: ${stats.seeded}`);
    logger.info(` ⏭️ Total duplicates (skipped): ${stats.duplicates}`);
    logger.info(` ⚠️ Total skipped (errors): ${stats.skipped}`);
  } catch (error) {
    if (error instanceof Error) {
      logger.error({ error: error.message, stack: error.stack }, 'Seeding failed');
    } else {
      logger.error({ error: JSON.stringify(error) }, 'Seeding failed');
    }
    process.exit(1);
  }
}
|
|
|
|
|
// Script entry point. The returned promise is intentionally not awaited:
// seedGuidelines handles its own errors and exits the process on failure.
void seedGuidelines();
|
|
|