// Scraped page header (not part of the program):
// Hugging Face Spaces - "Medagen / Backend" - status: Sleeping - file size: 13,431 bytes - revision d4abe4b

import { readdir, readFile, stat } from 'fs/promises';
import { join } from 'path';
import { dirname } from 'path';
import { relative, sep } from 'path';
import { fileURLToPath } from 'url';
import { createClient } from '@supabase/supabase-js';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { config, validateConfig } from '../utils/config.js';
import { logger } from '../utils/logger.js';

// Absolute path of this module and of the directory that contains it, derived from
// import.meta.url. __dirname is later used to locate the data folder.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ===================== CONFIG =====================
// Connection settings are read from the shared config module.
const SUPABASE_URL = config.supabase.url;
const SUPABASE_SERVICE_KEY = config.supabase.serviceKey;
const GEMINI_API_KEY = config.gemini.apiKey;

// Name of the sub-folder of the data root that is seeded by this script.
const SPECIALTY_FOLDER = 'Da liễu'; // Can be changed: "Than-kinh", etc.
// Human-friendly label derived from the folder name. slugToLabel is a function declaration
// further down the file, so it is already callable at this point.
const SPECIALTY_LABEL = slugToLabel(SPECIALTY_FOLDER);

// ===================== INIT CLIENTS =====================
// Supabase client created with the service key; session persistence is turned off.
const supabase = createClient(SUPABASE_URL, SUPABASE_SERVICE_KEY, {
  auth: { persistSession: false },
});

// Gemini client and the embedding model ('text-embedding-004') used by embed().
const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);
const embedModel = genAI.getGenerativeModel({ model: 'text-embedding-004' });

// ===================== HELPER FUNCTIONS =====================

/**
 * Convert a folder/file slug into a human-friendly label.
 *
 * Steps, in order:
 *   1. Normalise to Unicode NFC, so decomposed names (as some file systems return them) are
 *      handled the same way as composed ones.
 *   2. Turn runs of `-` / `_` into single spaces.
 *   3. Drop a leading "CHƯƠNG {n}" (chapter) prefix.
 *   4. Drop one leading numbering prefix such as "1.", "(IV)" or "2".
 *   5. Tidy spacing around ":", "." and ",".
 *
 * A bare roman numeral (no parentheses, no trailing dot) is only treated as numbering when it
 * is written in upper case. Matching it case-insensitively stripped ordinary leading words that
 * happen to consist of the letters I/V/X/L/C/D/M, e.g. "Mi mắt" became "mắt". An all-caps word
 * such as "VI" is inherently ambiguous and is still treated as numbering.
 *
 * @param slug - Raw folder or file name.
 * @returns The cleaned label; if cleaning removes everything, the slug with only its
 *          separators replaced by spaces.
 */
function slugToLabel(slug: string): string {
  // The patterns live inside the function on purpose: this function is called while the
  // module is still initialising (to compute SPECIALTY_LABEL), before any module-level
  // `const` declared further down the file would be initialised.

  // "CHƯƠNG {n}" / "CHUƠNG {n}" with an optional separator. Written with escapes
  // (U+01AF = Ư, U+01A0 = Ơ) so the pattern does not depend on how this file is encoded.
  const chapterPrefix = /^CH[U\u01AF]\u01A0NG\s*\d+\s*[.:~-]?\s*/iu;

  // One numbering token followed by whitespace. Alternatives, in order:
  //   "(iv" / "(iv)"  - opening parenthesis, any case
  //   "iv)"           - closing parenthesis, any case
  //   "iv."           - followed by a dot, any case
  //   "IV" / "12"     - bare token, digits or UPPER-CASE roman letters only
  const numberingPrefix =
    /^(?:\([0-9IVXLCDMivxlcdm]+\)?|[0-9IVXLCDMivxlcdm]+\)|[0-9IVXLCDMivxlcdm]+(?=\s*\.)|[0-9IVXLCDM]+)(?:\s*\.)?\s+/u;

  const spaced = slug.normalize('NFC').replace(/[-_]+/g, ' ');

  let label = spaced.replace(/\s+/g, ' ').trim();
  label = label.replace(chapterPrefix, '');
  label = label.replace(numberingPrefix, '');

  // Cleanup remaining punctuation/spacing. No "-" handling is needed here: every hyphen was
  // already turned into a space above.
  label = label
    .replace(/\s*:\s*/g, ': ')
    .replace(/\s+\./g, '. ')
    .replace(/,\s*/g, ', ')
    .replace(/\s{2,}/g, ' ')
    .trim();

  // A slug that consisted only of prefixes (e.g. "CHƯƠNG 3") keeps its readable form.
  return label || spaced.trim();
}

/**
 * Derive a section title from a section file name.
 *
 * Examples:
 *   - "ĐẠI CƯƠNG.txt"   -> "ĐẠI CƯƠNG"
 *   - "ĐẠI CƯƠNG_1.txt" -> "ĐẠI CƯƠNG" (a trailing "_<digits>" duplicate marker is dropped)
 *
 * @param filename - File name including its extension.
 * @returns `{ title }`, or `null` when the name does not end in ".txt" (case-sensitive) or
 *          nothing is left once the extension, duplicate marker and underscores are removed.
 */
function parseSectionFileName(filename: string): { title: string } | null {
  const extension = '.txt';
  if (!filename.endsWith(extension)) {
    return null;
  }

  const stem = filename.slice(0, filename.length - extension.length);
  // Drop the duplicate marker first, then read the remaining underscores as word breaks.
  const words = stem.replace(/_(\d+)$/, '').split(/_+/);
  const title = words.join(' ').trim();

  return title ? { title } : null;
}

/**
 * Produce an embedding vector for a piece of text using the module's Gemini embedding model.
 *
 * @param text - Text to embed.
 * @returns The `values` array of the embedding returned by the model.
 */
async function embed(text: string): Promise<number[]> {
  const { embedding } = await embedModel.embedContent(text);
  return embedding.values;
}

/**
 * Tell whether a path refers to a directory.
 *
 * @param path - File-system path to inspect.
 * @returns `true` when `stat` succeeds and reports a directory; `false` otherwise, including
 *          when the path cannot be stat'ed at all (e.g. it does not exist).
 */
async function isDirectory(path: string): Promise<boolean> {
  return stat(path)
    .then((entry) => entry.isDirectory())
    .catch(() => false);
}

/**
 * Look up a specialty row by exact name and create it when none exists.
 *
 * The lookup uses `.limit(1).maybeSingle()` instead of `.single()`. `.single()` reports the
 * same PGRST116 error for "no rows" and for "more than one row", so when duplicate rows
 * already existed they were mistaken for "not found" and yet another duplicate was inserted.
 *
 * @param name - Specialty name, matched against `specialties.name`.
 * @returns The id of the existing or newly created row, or `null` when the lookup or the
 *          insert fails (the error is logged).
 */
async function getOrCreateSpecialty(name: string): Promise<string | null> {
  // Try to find an existing specialty (first match wins if duplicates exist)
  const { data: existing, error: fetchError } = await supabase
    .from('specialties')
    .select('id')
    .eq('name', name)
    .limit(1)
    .maybeSingle();

  if (existing) {
    return existing.id;
  }

  // PGRST116 ("no single row") is still tolerated defensively and treated as "not found".
  if (fetchError && fetchError.code !== 'PGRST116') {
    logger.error({ error: fetchError }, `Error fetching specialty: ${name}`);
    return null;
  }

  // Create new specialty
  const { data: created, error: createError } = await supabase
    .from('specialties')
    .insert({ name })
    .select('id')
    .single();

  if (createError) {
    logger.error({ error: createError }, `Error creating specialty: ${name}`);
    return null;
  }

  return created?.id ?? null;
}

/**
 * Look up a disease row by name within a specialty and create it when none exists.
 *
 * The lookup uses `.limit(1).maybeSingle()` instead of `.single()`. `.single()` reports the
 * same PGRST116 error for "no rows" and for "more than one row", so when duplicate rows
 * already existed they were mistaken for "not found" and yet another duplicate was inserted.
 *
 * @param name - Disease name, matched against `diseases.name`.
 * @param specialtyId - Id of the owning specialty, matched against `diseases.specialty_id`.
 * @returns The id of the existing or newly created row, or `null` when the lookup or the
 *          insert fails (the error is logged).
 */
async function getOrCreateDisease(
  name: string,
  specialtyId: string
): Promise<string | null> {
  // Try to find an existing disease (first match wins if duplicates exist)
  const { data: existing, error: fetchError } = await supabase
    .from('diseases')
    .select('id')
    .eq('name', name)
    .eq('specialty_id', specialtyId)
    .limit(1)
    .maybeSingle();

  if (existing) {
    return existing.id;
  }

  // PGRST116 ("no single row") is still tolerated defensively and treated as "not found".
  if (fetchError && fetchError.code !== 'PGRST116') {
    logger.error({ error: fetchError }, `Error fetching disease: ${name}`);
    return null;
  }

  // Create new disease
  const { data: created, error: createError } = await supabase
    .from('diseases')
    .insert({ name, specialty_id: specialtyId })
    .select('id')
    .single();

  if (createError) {
    logger.error({ error: createError }, `Error creating disease: ${name}`);
    return null;
  }

  return created?.id ?? null;
}

/**
 * Resolve the id of an info domain from a section title.
 *
 * An exact match on `info_domains.name` is tried first. If that query reports an error
 * (which, with `.single()`, includes "no row found"), every domain is fetched and the first
 * one whose name contains the title, or is contained in it (case-insensitive), is returned.
 *
 * Blank names are never used for the substring comparison: the empty string is a substring
 * of every string, so a blank title or a blank/NULL domain name would otherwise "match" the
 * first row returned.
 *
 * @param name - Section title to resolve.
 * @returns The matching domain id, or `null` when nothing matches or the fallback query fails.
 */
async function getInfoDomainId(name: string): Promise<string | null> {
  const { data, error } = await supabase
    .from('info_domains')
    .select('id')
    .eq('name', name)
    .single();

  if (!error) {
    return data?.id || null;
  }

  // Fuzzy fallback
  const normalizedName = name.toLowerCase().trim();
  if (!normalizedName) {
    return null;
  }

  const { data: allDomains, error: listError } = await supabase
    .from('info_domains')
    .select('id, name');

  if (listError) {
    logger.warn({ error: listError }, `Error listing info domains while matching: ${name}`);
    return null;
  }

  for (const domain of allDomains ?? []) {
    const candidate = typeof domain.name === 'string' ? domain.name.toLowerCase().trim() : '';
    if (!candidate) {
      continue;
    }
    if (candidate.includes(normalizedName) || normalizedName.includes(candidate)) {
      return domain.id;
    }
  }

  return null;
}

/** Running totals reported when a seeding run finishes. */
interface SeedCounters {
  seeded: number;
  skipped: number;
  duplicates: number;
}

/** Everything needed to seed one section file. */
interface SectionJob {
  /** Absolute path of the data root; stored chunk paths are relative to it. */
  dataRoot: string;
  /** Absolute path of the disease folder that contains the file. */
  diseasePath: string;
  filename: string;
  sectionTitle: string;
  specialtyId: string;
  chapterLabel: string;
  diseaseLabel: string;
  /** Returns the disease row id, or `null` when it could not be found or created. */
  resolveDiseaseId: () => Promise<string | null>;
}

/**
 * Path of `target` relative to `root`, always written with "/" separators so the stored
 * value is the same on every platform.
 */
function toPortableRelativePath(root: string, target: string): string {
  return relative(root, target).split(sep).join('/');
}

/**
 * Seed one section file: insert it into `medical_knowledge_chunks` (unless a row with the
 * same path already exists) and make sure a matching `guidelines` / `guideline_chunks` pair
 * exists. `counters` is updated in place. Exceptions (e.g. from reading the file or from the
 * embedding call) propagate to the caller.
 */
async function seedSectionFile(job: SectionJob, counters: SeedCounters): Promise<void> {
  const { dataRoot, diseasePath, filename, sectionTitle, specialtyId, chapterLabel, diseaseLabel } =
    job;
  const sectionPath = join(diseasePath, filename);

  const content = await readFile(sectionPath, 'utf-8');
  const contentTrimmed = content.trim();

  if (!contentTrimmed) {
    logger.warn(`    ⚠️  Skipping empty file: ${filename}`);
    counters.skipped++;
    return;
  }

  const relativePath = toPortableRelativePath(dataRoot, sectionPath);

  logger.info(`    📄 ${sectionTitle} (${contentTrimmed.length} chars)`);

  // Get or create disease record
  const diseaseId = await job.resolveDiseaseId();

  // The embedding is requested only when a row is actually going to be inserted, and at most
  // once per file, so re-running the script over already-seeded data makes no embedding calls.
  let cachedEmbedding: number[] | undefined;
  const getEmbedding = async (): Promise<number[]> => {
    if (!cachedEmbedding) {
      cachedEmbedding = await embed(contentTrimmed);
    }
    return cachedEmbedding;
  };

  // Check if this chunk already exists (by path) to avoid duplicates.
  // `.limit(1).maybeSingle()` also finds the row when duplicates already exist.
  const { data: existingMedicalChunk, error: medicalLookupError } = await supabase
    .from('medical_knowledge_chunks')
    .select('id')
    .eq('path', relativePath)
    .limit(1)
    .maybeSingle();

  if (medicalLookupError) {
    logger.warn(`    ⚠️  Failed to check for existing medical chunk: ${medicalLookupError.message}`);
  }

  let isNewMedicalChunk = false;
  if (existingMedicalChunk) {
    logger.info(`    ⏭️  Skipping duplicate medical chunk: ${sectionTitle} (already exists)`);
    counters.duplicates++;
    // Still insert into guideline_chunks for RAG compatibility
  } else {
    isNewMedicalChunk = true;

    // Get info domain ID (match section title with info domain)
    const infoDomainId = await getInfoDomainId(sectionTitle);
    const embedding = await getEmbedding();

    // 1. Insert into medical_knowledge_chunks (Structured Knowledge)
    const { error: error1 } = await supabase.from('medical_knowledge_chunks').insert({
      specialty_id: specialtyId,
      disease_id: diseaseId,
      info_domain_id: infoDomainId,
      specialty: SPECIALTY_LABEL,
      chapter: chapterLabel,
      disease: diseaseLabel,
      section_title: sectionTitle,
      content: contentTrimmed,
      path: relativePath,
      embedding,
    });

    if (error1) {
      logger.error({ error: error1.message }, `    ❌ Error inserting structured chunk: ${sectionTitle}`);
      counters.skipped++;
      return; // Skip legacy insert if structured failed
    }
  }

  // 2. Insert into guidelines + guideline_chunks (RAG Compatibility)
  // Check if guideline exists for this disease + section
  let guidelineId;
  const { data: existingGuideline, error: guidelineLookupError } = await supabase
    .from('guidelines')
    .select('id')
    .eq('condition', diseaseLabel)
    .eq('source', sectionTitle) // Using section title as source/context
    .limit(1)
    .maybeSingle();

  if (guidelineLookupError) {
    logger.warn(`    ⚠️  Failed to look up guideline record: ${guidelineLookupError.message}`);
  }

  if (existingGuideline) {
    guidelineId = existingGuideline.id;
  } else {
    const { data: newGuideline, error: gError } = await supabase
      .from('guidelines')
      .insert({
        condition: diseaseLabel,
        source: sectionTitle,
        updated_at: new Date().toISOString(),
      })
      .select('id')
      .single();

    if (gError || !newGuideline) {
      logger.warn(`    ⚠️  Failed to create guideline record: ${gError?.message}`);
    } else {
      guidelineId = newGuideline.id;
    }
  }

  if (guidelineId) {
    // Check if guideline chunk already exists to avoid duplicates.
    // NOTE(review): this sends the whole section text as a query filter; confirm that long
    // sections stay within the request-size limits of the API. A failed lookup is logged and
    // then treated as "not found", as before.
    const { data: existingGuidelineChunk, error: guidelineChunkLookupError } = await supabase
      .from('guideline_chunks')
      .select('id')
      .eq('guideline_id', guidelineId)
      .eq('content', contentTrimmed)
      .limit(1)
      .maybeSingle();

    if (guidelineChunkLookupError) {
      logger.warn(
        `    ⚠️  Failed to check for existing guideline chunk: ${guidelineChunkLookupError.message}`
      );
    }

    if (!existingGuidelineChunk) {
      // Insert chunk only if it doesn't exist
      const embedding = await getEmbedding(); // Reuses the embedding computed above, if any
      const { error: chunkError } = await supabase.from('guideline_chunks').insert({
        guideline_id: guidelineId,
        content: contentTrimmed,
        embedding,
        metadata: {
          chapter: chapterLabel,
          specialty: SPECIALTY_LABEL,
        },
      });

      if (chunkError) {
        logger.warn(`    ⚠️  Failed to insert guideline chunk: ${chunkError.message}`);
      }
    }
  }

  // Count as seeded only if we inserted a new medical chunk
  if (isNewMedicalChunk) {
    counters.seeded++;
  }
}

/**
 * Seed every section file found under `<data root>/<SPECIALTY_FOLDER>/<chapter>/<disease>/`.
 *
 * For each `.txt` file (except `_raw.txt`) the text is stored in `medical_knowledge_chunks`
 * and mirrored into `guidelines` / `guideline_chunks`. Files are processed one at a time; a
 * failure on one file is logged and counted, and the run continues with the next file.
 *
 * Exits the process with code 1 when the configuration is invalid, the specialty folder is
 * missing, the specialty record cannot be resolved, or an unexpected error escapes the walk.
 */
async function seedGuidelines(): Promise<void> {
  try {
    logger.info(`Starting medical knowledge seeding for ${SPECIALTY_LABEL}...`);
    logger.info(`⚠️  This will seed ALL diseases in the ${SPECIALTY_FOLDER} folder.`);

    validateConfig();

    // Get data folder path (relative to project root)
    const dataRoot = join(__dirname, '../../data');
    const specialtyRoot = join(dataRoot, SPECIALTY_FOLDER);

    logger.info(`Reading data from: ${specialtyRoot}`);

    // Check if specialty folder exists
    try {
      await stat(specialtyRoot);
    } catch {
      logger.error(`Specialty folder not found: ${specialtyRoot}`);
      process.exit(1);
    }

    // Get or create specialty record
    const maybeSpecialtyId = await getOrCreateSpecialty(SPECIALTY_LABEL);
    if (!maybeSpecialtyId) {
      logger.error('Failed to get or create specialty record');
      process.exit(1);
    }
    const specialtyId: string = maybeSpecialtyId;
    logger.info(`Specialty ID: ${specialtyId}`);

    const counters: SeedCounters = { seeded: 0, skipped: 0, duplicates: 0 };

    // Process ALL chapters (no filtering)
    const chapterEntries = await readdir(specialtyRoot);
    for (const chapterSlug of chapterEntries) {
      const chapterPath = join(specialtyRoot, chapterSlug);

      if (!(await isDirectory(chapterPath))) {
        continue;
      }

      const chapterLabel = slugToLabel(chapterSlug);
      logger.info(`\n📖 Processing chapter: ${chapterLabel}`);

      // Process ALL diseases in this chapter (no filtering)
      const diseaseEntries = await readdir(chapterPath);
      for (const diseaseSlug of diseaseEntries) {
        const diseasePath = join(chapterPath, diseaseSlug);

        if (!(await isDirectory(diseasePath))) {
          continue;
        }

        const diseaseLabel = slugToLabel(diseaseSlug);
        logger.info(`  🩺 Processing disease: ${diseaseLabel}`);

        // The disease row is resolved on first use and then reused for every file of this
        // disease. A failed attempt (null) is not cached, so the next file tries again.
        let cachedDiseaseId: string | null = null;
        const resolveDiseaseId = async (): Promise<string | null> => {
          if (cachedDiseaseId === null) {
            cachedDiseaseId = await getOrCreateDisease(diseaseLabel, specialtyId);
          }
          return cachedDiseaseId;
        };

        // Read section files
        const sectionFiles = await readdir(diseasePath);

        for (const filename of sectionFiles) {
          // Skip non-txt files and _raw.txt
          if (!filename.endsWith('.txt') || filename === '_raw.txt') {
            continue;
          }

          const sectionInfo = parseSectionFileName(filename);
          if (!sectionInfo) {
            logger.warn(`    ⚠️  Skipping invalid filename: ${filename}`);
            counters.skipped++;
            continue;
          }

          try {
            await seedSectionFile(
              {
                dataRoot,
                diseasePath,
                filename,
                sectionTitle: sectionInfo.title,
                specialtyId,
                chapterLabel,
                diseaseLabel,
                resolveDiseaseId,
              },
              counters
            );
          } catch (error) {
            logger.error({ error }, `    ❌ Error processing file: ${filename}`);
            counters.skipped++;
          }
        }
      }
    }

    logger.info(`\n✅ Seeding completed!`);
    logger.info(`   📊 Total seeded: ${counters.seeded}`);
    logger.info(`   ⏭️  Total duplicates (skipped): ${counters.duplicates}`);
    logger.info(`   ⚠️  Total skipped (errors): ${counters.skipped}`);
  } catch (error) {
    if (error instanceof Error) {
      logger.error({ error: error.message, stack: error.stack }, 'Seeding failed');
    } else {
      logger.error({ error: JSON.stringify(error) }, 'Seeding failed');
    }
    process.exit(1);
  }
}

// Script entry point: seeding starts as soon as this module is loaded. seedGuidelines
// catches its own errors and exits the process with code 1, so the promise is not awaited.
seedGuidelines();