Spaces:

Medagen
/

Backend

Sleeping

App Files Files Community

Backend / src /scripts /seed-guidelines.ts

Cuong2004

init

d4abe4b 2 months ago

raw

history blame contribute delete

13.4 kB

	import { readdir, readFile, stat } from 'fs/promises';
	import { join } from 'path';
	import { fileURLToPath } from 'url';
	import { dirname } from 'path';
	import { createClient } from '@supabase/supabase-js';
	import { GoogleGenerativeAI } from '@google/generative-ai';
	import { config, validateConfig } from '../utils/config.js';
	import { logger } from '../utils/logger.js';

	const __filename = fileURLToPath(import.meta.url);
	const __dirname = dirname(__filename);

	// ===================== CONFIG =====================
	const SUPABASE_URL = config.supabase.url;
	const SUPABASE_SERVICE_KEY = config.supabase.serviceKey;
	const GEMINI_API_KEY = config.gemini.apiKey;

	const SPECIALTY_FOLDER = 'Da liễu'; // Can be changed: "Than-kinh", etc.
	const SPECIALTY_LABEL = slugToLabel(SPECIALTY_FOLDER);

	// ===================== INIT CLIENTS =====================
	const supabase = createClient(SUPABASE_URL, SUPABASE_SERVICE_KEY, {
	auth: { persistSession: false },
	});

	const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);
	const embedModel = genAI.getGenerativeModel({ model: 'text-embedding-004' });

	// ===================== HELPER FUNCTIONS =====================

	/**
	* Convert slug to human-friendly label
	*/
	function slugToLabel(slug: string): string {
	let label = slug.replace(/[-_]+/g, ' ');
	label = label.replace(/\s+/g, ' ').trim();

	// Remove "CHƯƠNG {n}" prefix entirely
	label = label.replace(/^(CH(U\|Ư)ƠNG)\s\d+\s[.:~-]?\s*/iu, '');

	// Remove leading numeric / roman numeral prefixes (e.g., "1.", "(IV)", "2-")
	label = label.replace(/^(?:$?[0-9IVXLCDM]+$?)(?:\s*[\.\-])?\s+/iu, '');

	// Cleanup remaining punctuation/spacing
	label = label.replace(/\s:\s/g, ': ');
	label = label.replace(/\s-\s/g, ' - ');
	label = label.replace(/\s+\./g, '. ');
	label = label.replace(/,\s*/g, ', ');
	label = label.replace(/\s{2,}/g, ' ').trim();

	if (!label) {
	label = slug.replace(/[-_]+/g, ' ').trim();
	}

	return label;
	}

	/**
	* Parse section filename to human-readable title.
	* Supports formats like:
	* - "ĐẠI CƯƠNG.txt"
	* - "ĐẠI CƯƠNG_1.txt" (duplicate-safe suffix)
	*/
	function parseSectionFileName(filename: string): { title: string } \| null {
	if (!filename.endsWith('.txt')) return null;
	const base = filename.replace(/\.txt$/i, '');
	const withoutDuplicateSuffix = base.replace(/_(\d+)$/, '');
	const title = withoutDuplicateSuffix.replace(/[_]+/g, ' ').trim();
	if (!title) return null;
	return { title };
	}

	/**
	* Generate embedding using Gemini
	*/
	async function embed(text: string): Promise<number[]> {
	const res = await embedModel.embedContent(text);
	return res.embedding.values;
	}

	/**
	* Check if a directory entry is a directory
	*/
	async function isDirectory(path: string): Promise<boolean> {
	try {
	const stats = await stat(path);
	return stats.isDirectory();
	} catch {
	return false;
	}
	}

	/**
	* Get or create specialty record
	*/
	async function getOrCreateSpecialty(name: string): Promise<string \| null> {
	// Try to find existing specialty
	const { data: existing, error: fetchError } = await supabase
	.from('specialties')
	.select('id')
	.eq('name', name)
	.single();

	if (existing) {
	return existing.id;
	}

	if (fetchError && fetchError.code !== 'PGRST116') {
	logger.error({ error: fetchError }, `Error fetching specialty: ${name}`);
	return null;
	}

	// Create new specialty
	const { data: created, error: createError } = await supabase
	.from('specialties')
	.insert({ name })
	.select('id')
	.single();

	if (createError) {
	logger.error({ error: createError }, `Error creating specialty: ${name}`);
	return null;
	}

	return created?.id \|\| null;
	}

	/**
	* Get or create disease record
	*/
	async function getOrCreateDisease(
	name: string,
	specialtyId: string
	): Promise<string \| null> {
	// Try to find existing disease
	const { data: existing, error: fetchError } = await supabase
	.from('diseases')
	.select('id')
	.eq('name', name)
	.eq('specialty_id', specialtyId)
	.single();

	if (existing) {
	return existing.id;
	}

	if (fetchError && fetchError.code !== 'PGRST116') {
	logger.error({ error: fetchError }, `Error fetching disease: ${name}`);
	return null;
	}

	// Create new disease
	const { data: created, error: createError } = await supabase
	.from('diseases')
	.insert({ name, specialty_id: specialtyId })
	.select('id')
	.single();

	if (createError) {
	logger.error({ error: createError }, `Error creating disease: ${name}`);
	return null;
	}

	return created?.id \|\| null;
	}

	/**
	* Get info domain ID by name
	*/
	async function getInfoDomainId(name: string): Promise<string \| null> {
	const { data, error } = await supabase
	.from('info_domains')
	.select('id')
	.eq('name', name)
	.single();

	if (error) {
	// Try fuzzy match
	const normalizedName = name.toLowerCase().trim();
	const { data: allDomains } = await supabase
	.from('info_domains')
	.select('id, name');

	if (allDomains) {
	for (const domain of allDomains) {
	if (domain.name.toLowerCase().includes(normalizedName) \|\|
	normalizedName.includes(domain.name.toLowerCase())) {
	return domain.id;
	}
	}
	}
	return null;
	}

	return data?.id \|\| null;
	}

	async function seedGuidelines() {
	try {
	logger.info(`Starting medical knowledge seeding for ${SPECIALTY_LABEL}...`);
	logger.info(`⚠️ This will seed ALL diseases in the ${SPECIALTY_FOLDER} folder.`);

	validateConfig();

	// Get data folder path (relative to project root)
	const dataRoot = join(__dirname, '../../data');
	const specialtyRoot = join(dataRoot, SPECIALTY_FOLDER);

	logger.info(`Reading data from: ${specialtyRoot}`);

	// Check if specialty folder exists
	try {
	await stat(specialtyRoot);
	} catch {
	logger.error(`Specialty folder not found: ${specialtyRoot}`);
	process.exit(1);
	}

	// Get or create specialty record
	const specialtyId = await getOrCreateSpecialty(SPECIALTY_LABEL);
	if (!specialtyId) {
	logger.error('Failed to get or create specialty record');
	process.exit(1);
	}
	logger.info(`Specialty ID: ${specialtyId}`);

	// Read chapter directories
	const chapterEntries = await readdir(specialtyRoot);
	let totalSeeded = 0;
	let totalSkipped = 0;
	let totalDuplicates = 0;

	// Process ALL chapters (no filtering)
	for (const chapterSlug of chapterEntries) {
	const chapterPath = join(specialtyRoot, chapterSlug);

	if (!(await isDirectory(chapterPath))) {
	continue;
	}

	const chapterLabel = slugToLabel(chapterSlug);
	logger.info(`\n📖 Processing chapter: ${chapterLabel}`);

	// Read disease directories
	const diseaseEntries = await readdir(chapterPath);

	// Process ALL diseases in this chapter (no filtering)
	for (const diseaseSlug of diseaseEntries) {
	const diseasePath = join(chapterPath, diseaseSlug);

	if (!(await isDirectory(diseasePath))) {
	continue;
	}

	const diseaseLabel = slugToLabel(diseaseSlug);
	logger.info(` 🩺 Processing disease: ${diseaseLabel}`);

	// Read section files
	const sectionFiles = await readdir(diseasePath);

	for (const filename of sectionFiles) {
	// Skip non-txt files and _raw.txt
	if (!filename.endsWith('.txt') \|\| filename === '_raw.txt') {
	continue;
	}

	const sectionInfo = parseSectionFileName(filename);
	if (!sectionInfo) {
	logger.warn(` ⚠️ Skipping invalid filename: ${filename}`);
	totalSkipped++;
	continue;
	}

	const { title: sectionTitle } = sectionInfo;
	const sectionPath = join(diseasePath, filename);

	try {
	const content = await readFile(sectionPath, 'utf-8');
	const contentTrimmed = content.trim();

	if (!contentTrimmed) {
	logger.warn(` ⚠️ Skipping empty file: ${filename}`);
	totalSkipped++;
	continue;
	}

	const relativePath = sectionPath.replace(dataRoot + '/', '');

	logger.info(` 📄 ${sectionTitle} (${contentTrimmed.length} chars)`);

	// Get or create disease record
	const diseaseId = await getOrCreateDisease(diseaseLabel, specialtyId);

	// Get info domain ID (match section title with info domain)
	const infoDomainId = await getInfoDomainId(sectionTitle);

	// Generate embedding
	const embedding = await embed(contentTrimmed);

	// Check if this chunk already exists (by path) to avoid duplicates
	const { data: existingMedicalChunk } = await supabase
	.from('medical_knowledge_chunks')
	.select('id')
	.eq('path', relativePath)
	.single();

	let isNewMedicalChunk = false;
	if (existingMedicalChunk) {
	logger.info(` ⏭️ Skipping duplicate medical chunk: ${sectionTitle} (already exists)`);
	totalDuplicates++;
	// Still insert into guideline_chunks for RAG compatibility
	} else {
	isNewMedicalChunk = true;
	// 1. Insert into medical_knowledge_chunks (Structured Knowledge)
	const { error: error1 } = await supabase.from('medical_knowledge_chunks').insert({
	specialty_id: specialtyId,
	disease_id: diseaseId,
	info_domain_id: infoDomainId,
	specialty: SPECIALTY_LABEL,
	chapter: chapterLabel,
	disease: diseaseLabel,
	section_title: sectionTitle,
	content: contentTrimmed,
	path: relativePath,
	embedding,
	});

	if (error1) {
	logger.error({ error: error1.message }, ` ❌ Error inserting structured chunk: ${sectionTitle}`);
	totalSkipped++;
	continue; // Skip legacy insert if structured failed
	}
	}

	// 2. Insert into guidelines + guideline_chunks (RAG Compatibility)
	// Check if guideline exists for this disease + section
	let guidelineId;
	const { data: existingGuideline } = await supabase
	.from('guidelines')
	.select('id')
	.eq('condition', diseaseLabel)
	.eq('source', sectionTitle) // Using section title as source/context
	.single();

	if (existingGuideline) {
	guidelineId = existingGuideline.id;
	} else {
	const { data: newGuideline, error: gError } = await supabase
	.from('guidelines')
	.insert({
	condition: diseaseLabel,
	source: sectionTitle,
	updated_at: new Date().toISOString()
	})
	.select('id')
	.single();

	if (gError \|\| !newGuideline) {
	logger.warn(` ⚠️ Failed to create guideline record: ${gError?.message}`);
	} else {
	guidelineId = newGuideline.id;
	}
	}

	if (guidelineId) {
	// Check if guideline chunk already exists to avoid duplicates
	const { data: existingGuidelineChunk } = await supabase
	.from('guideline_chunks')
	.select('id')
	.eq('guideline_id', guidelineId)
	.eq('content', contentTrimmed)
	.single();

	if (!existingGuidelineChunk) {
	// Insert chunk only if it doesn't exist
	const { error: chunkError } = await supabase.from('guideline_chunks').insert({
	guideline_id: guidelineId,
	content: contentTrimmed,
	embedding, // Reuse same embedding
	metadata: {
	chapter: chapterLabel,
	specialty: SPECIALTY_LABEL
	}
	});

	if (chunkError) {
	logger.warn(` ⚠️ Failed to insert guideline chunk: ${chunkError.message}`);
	}
	}
	}

	// Count as seeded only if we inserted a new medical chunk
	if (isNewMedicalChunk) {
	totalSeeded++;
	}


	} catch (error) {
	logger.error({ error }, ` ❌ Error processing file: ${filename}`);
	totalSkipped++;
	}
	}
	}
	}

	logger.info(`\n✅ Seeding completed!`);
	logger.info(` 📊 Total seeded: ${totalSeeded}`);
	logger.info(` ⏭️ Total duplicates (skipped): ${totalDuplicates}`);
	logger.info(` ⚠️ Total skipped (errors): ${totalSkipped}`);

	} catch (error) {
	if (error instanceof Error) {
	logger.error({ error: error.message, stack: error.stack }, 'Seeding failed');
	} else {
	logger.error({ error: JSON.stringify(error) }, 'Seeding failed');
	}
	process.exit(1);
	}
	}

	seedGuidelines();