File size: 13,431 Bytes
d4abe4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
import { readdir, readFile, stat } from 'fs/promises';
import { join } from 'path';
import { fileURLToPath } from 'url';
import { dirname } from 'path';
import { createClient } from '@supabase/supabase-js';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { config, validateConfig } from '../utils/config.js';
import { logger } from '../utils/logger.js';

// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ===================== CONFIG =====================
// Credentials come from the shared config module.
const { url: SUPABASE_URL, serviceKey: SUPABASE_SERVICE_KEY } = config.supabase;
const { apiKey: GEMINI_API_KEY } = config.gemini;

// Folder (under the data root) whose contents are seeded, and its display label.
const SPECIALTY_FOLDER = 'Da liễu'; // Can be changed: "Than-kinh", etc.
const SPECIALTY_LABEL = slugToLabel(SPECIALTY_FOLDER);

// ===================== INIT CLIENTS =====================
// Supabase client created with session persistence switched off.
const supabase = createClient(SUPABASE_URL, SUPABASE_SERVICE_KEY, {
  auth: {
    persistSession: false,
  },
});

// Gemini client and the embedding model used by embed().
const genAI = new GoogleGenerativeAI(GEMINI_API_KEY);
const embedModel = genAI.getGenerativeModel({
  model: 'text-embedding-004',
});

// ===================== HELPER FUNCTIONS =====================

/**
 * Turn a folder/file slug into a display label.
 *
 * Steps, in order:
 *   1. hyphens/underscores become spaces and whitespace is collapsed;
 *   2. a leading "CHƯƠNG {n}" (or "CHUƠNG {n}") chapter prefix is dropped;
 *   3. a leading numeric / roman-numeral prefix followed by whitespace is
 *      dropped (e.g. "1. ", "(IV) ");
 *   4. spacing around ':', '-', '.' and ',' is normalised.
 *
 * NOTE: step 3 is case-insensitive, so a leading word made up only of the
 * letters I, V, X, L, C, D, M (for example "Vi") is treated as a numeral and
 * removed as well. Hyphens are already replaced in step 1, so the hyphen
 * spacing rule in step 4 never finds anything to rewrite.
 *
 * @param slug Raw directory or file name.
 * @returns The cleaned label; if cleaning leaves nothing, the slug with
 *          hyphens/underscores replaced by spaces and trimmed.
 */
function slugToLabel(slug: string): string {
  const spaced = slug.replace(/[-_]+/g, ' ');
  const collapsed = spaced.replace(/\s+/g, ' ').trim();

  // Ordered rewrite rules; each one is applied to the result of the previous.
  const rewrites: ReadonlyArray<readonly [RegExp, string]> = [
    // Remove "CHƯƠNG {n}" prefix entirely
    [/^(CH(U|Ư)ƠNG)\s*\d+\s*[.:~-]?\s*/iu, ''],
    // Remove leading numeric / roman numeral prefixes (e.g., "1.", "(IV)", "2-")
    [/^(?:\(?[0-9IVXLCDM]+\)?)(?:\s*[\.\-])?\s+/iu, ''],
    // Cleanup remaining punctuation/spacing
    [/\s*:\s*/g, ': '],
    [/\s*-\s*/g, ' - '],
    [/\s+\./g, '. '],
    [/,\s*/g, ', '],
    [/\s{2,}/g, ' '],
  ];

  const cleaned = rewrites
    .reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), collapsed)
    .trim();

  // Fall back to the minimally processed slug when everything was stripped.
  return cleaned || spaced.trim();
}

/**
 * Derive a section title from a section file name.
 *
 * Only names ending in lowercase ".txt" are accepted. The extension and a
 * trailing "_<digits>" duplicate marker are removed, remaining underscores
 * become spaces, and the result is trimmed. Examples:
 *   - "ĐẠI CƯƠNG.txt"   -> "ĐẠI CƯƠNG"
 *   - "ĐẠI CƯƠNG_1.txt" -> "ĐẠI CƯƠNG"
 *
 * @param filename File name (not a full path).
 * @returns `{ title }`, or `null` when the name is not a ".txt" file or the
 *          resulting title is empty.
 */
function parseSectionFileName(filename: string): { title: string } | null {
  const title = filename.endsWith('.txt')
    ? filename
        .replace(/\.txt$/i, '')
        .replace(/_(\d+)$/, '')
        .replace(/[_]+/g, ' ')
        .trim()
    : '';

  return title ? { title } : null;
}

/**
 * Request an embedding vector for `text` from the configured Gemini
 * embedding model.
 *
 * @param text Text to embed.
 * @returns The `values` array of the returned embedding.
 */
async function embed(text: string): Promise<number[]> {
  const { embedding } = await embedModel.embedContent(text);
  return embedding.values;
}

/**
 * Report whether `path` refers to a directory.
 *
 * @param path File-system path to inspect.
 * @returns `true` when `stat` succeeds and reports a directory; `false` for
 *          any other entry type or when `stat` fails (e.g. missing path).
 */
async function isDirectory(path: string): Promise<boolean> {
  // A failed stat is folded into "not a directory" rather than propagated.
  const entry = await stat(path).catch(() => null);
  return entry !== null && entry.isDirectory();
}

/**
 * Look up a row in `specialties` by exact name, inserting one if none exists.
 *
 * The lookup uses `.limit(1).maybeSingle()` instead of `.single()`:
 * `.single()` reports PGRST116 both for "no rows" and for "more than one
 * row", so pre-existing duplicates used to be mistaken for "not found" and
 * triggered yet another insert. With this form, zero rows yields `null` data
 * without an error, and any reported error is a genuine failure.
 *
 * @param name Specialty name to find or create.
 * @returns The specialty id, or `null` when the lookup or the insert fails
 *          (the error is logged).
 */
async function getOrCreateSpecialty(name: string): Promise<string | null> {
  // Try to find an existing specialty (first match wins if duplicates exist)
  const { data: existing, error: fetchError } = await supabase
    .from('specialties')
    .select('id')
    .eq('name', name)
    .limit(1)
    .maybeSingle();

  if (fetchError) {
    logger.error({ error: fetchError }, `Error fetching specialty: ${name}`);
    return null;
  }

  if (existing) {
    return existing.id;
  }

  // Nothing found: create a new specialty and read back its id
  const { data: created, error: createError } = await supabase
    .from('specialties')
    .insert({ name })
    .select('id')
    .single();

  if (createError) {
    logger.error({ error: createError }, `Error creating specialty: ${name}`);
    return null;
  }

  return created?.id ?? null;
}

/**
 * Look up a row in `diseases` by exact name within a specialty, inserting one
 * if none exists.
 *
 * The lookup uses `.limit(1).maybeSingle()` instead of `.single()`:
 * `.single()` reports PGRST116 both for "no rows" and for "more than one
 * row", so pre-existing duplicates used to be mistaken for "not found" and
 * triggered yet another insert. With this form, zero rows yields `null` data
 * without an error, and any reported error is a genuine failure.
 *
 * @param name Disease name to find or create.
 * @param specialtyId Id of the specialty the disease belongs to.
 * @returns The disease id, or `null` when the lookup or the insert fails
 *          (the error is logged).
 */
async function getOrCreateDisease(
  name: string,
  specialtyId: string
): Promise<string | null> {
  // Try to find an existing disease (first match wins if duplicates exist)
  const { data: existing, error: fetchError } = await supabase
    .from('diseases')
    .select('id')
    .eq('name', name)
    .eq('specialty_id', specialtyId)
    .limit(1)
    .maybeSingle();

  if (fetchError) {
    logger.error({ error: fetchError }, `Error fetching disease: ${name}`);
    return null;
  }

  if (existing) {
    return existing.id;
  }

  // Nothing found: create a new disease and read back its id
  const { data: created, error: createError } = await supabase
    .from('diseases')
    .insert({ name, specialty_id: specialtyId })
    .select('id')
    .single();

  if (createError) {
    logger.error({ error: createError }, `Error creating disease: ${name}`);
    return null;
  }

  return created?.id ?? null;
}

/**
 * Resolve the id of an `info_domains` row for a section title.
 *
 * An exact name match is tried first. If that yields nothing, every domain is
 * loaded and the first one whose lower-cased name contains the lower-cased
 * title, or is contained in it, is returned.
 *
 * Empty names are never matched: `''` is a substring of every string, so an
 * empty title or an empty domain name would otherwise match the first row
 * returned. Query errors are logged instead of being silently ignored.
 *
 * @param name Section title to match against info domain names.
 * @returns The matching domain id, or `null` when nothing matches or the
 *          domain list cannot be loaded.
 */
async function getInfoDomainId(name: string): Promise<string | null> {
  // Exact match; zero rows is reported as null data without an error.
  const { data, error } = await supabase
    .from('info_domains')
    .select('id')
    .eq('name', name)
    .limit(1)
    .maybeSingle();

  if (data) {
    return data.id ?? null;
  }

  if (error) {
    logger.warn({ error: error.message }, `Exact info domain lookup failed for: ${name}`);
  }

  // Try fuzzy match
  const normalizedName = name.toLowerCase().trim();
  if (!normalizedName) {
    return null;
  }

  const { data: allDomains, error: listError } = await supabase
    .from('info_domains')
    .select('id, name');

  if (listError) {
    logger.warn({ error: listError.message }, `Could not load info domains to match: ${name}`);
    return null;
  }

  for (const domain of allDomains ?? []) {
    const domainName =
      typeof domain.name === 'string' ? domain.name.toLowerCase().trim() : '';
    if (!domainName) {
      continue; // a blank name would match everything
    }
    if (domainName.includes(normalizedName) || normalizedName.includes(domainName)) {
      return domain.id;
    }
  }

  return null;
}

/**
 * Mirror one section into the legacy RAG tables (`guidelines` +
 * `guideline_chunks`).
 *
 * Finds or creates the guideline row keyed by disease label + section title,
 * then inserts the section content as a chunk unless an identical chunk is
 * already stored for that guideline. Errors reported by Supabase are logged
 * as warnings and end the step; a rejection from `getEmbedding` propagates to
 * the caller.
 *
 * @param section Labels and trimmed content of the section being seeded.
 * @param getEmbedding Lazily produces the embedding for `section.content`;
 *        only called when a chunk actually has to be inserted.
 */
async function ensureGuidelineChunk(
  section: {
    diseaseLabel: string;
    chapterLabel: string;
    sectionTitle: string;
    content: string;
  },
  getEmbedding: () => Promise<number[]>
): Promise<void> {
  const { diseaseLabel, chapterLabel, sectionTitle, content } = section;

  // Check if a guideline exists for this disease + section.
  // limit(1).maybeSingle() tolerates both "no rows" and pre-existing duplicates.
  const { data: existingGuideline, error: guidelineLookupError } = await supabase
    .from('guidelines')
    .select('id')
    .eq('condition', diseaseLabel)
    .eq('source', sectionTitle) // Using section title as source/context
    .limit(1)
    .maybeSingle();

  if (guidelineLookupError) {
    logger.warn(`    ⚠️  Failed to look up guideline record: ${guidelineLookupError.message}`);
    return;
  }

  let guidelineId = existingGuideline?.id;
  if (!existingGuideline) {
    const { data: newGuideline, error: gError } = await supabase
      .from('guidelines')
      .insert({
        condition: diseaseLabel,
        source: sectionTitle,
        updated_at: new Date().toISOString(),
      })
      .select('id')
      .single();

    if (gError || !newGuideline) {
      logger.warn(`    ⚠️  Failed to create guideline record: ${gError?.message}`);
      return;
    }
    guidelineId = newGuideline.id;
  }

  if (!guidelineId) {
    return;
  }

  // Compare content client-side: sending the whole section text as a URL
  // filter (`content=eq.<text>`) can exceed request-URL limits for long
  // sections, and the resulting error would look like "no existing chunk".
  const { data: existingChunks, error: chunkLookupError } = await supabase
    .from('guideline_chunks')
    .select('content')
    .eq('guideline_id', guidelineId);

  if (chunkLookupError) {
    logger.warn(`    ⚠️  Failed to check existing guideline chunks: ${chunkLookupError.message}`);
    return;
  }

  for (const chunk of existingChunks ?? []) {
    if (chunk.content === content) {
      return; // identical chunk already stored
    }
  }

  const { error: chunkError } = await supabase.from('guideline_chunks').insert({
    guideline_id: guidelineId,
    content,
    embedding: await getEmbedding(), // same vector as the structured chunk
    metadata: {
      chapter: chapterLabel,
      specialty: SPECIALTY_LABEL,
    },
  });

  if (chunkError) {
    logger.warn(`    ⚠️  Failed to insert guideline chunk: ${chunkError.message}`);
  }
}

/**
 * Seed the knowledge base from the data folder of the configured specialty.
 *
 * Expected layout: `<data>/<SPECIALTY_FOLDER>/<chapter>/<disease>/<section>.txt`.
 * For every non-empty section file (excluding `_raw.txt`) this:
 *   1. inserts a row into `medical_knowledge_chunks` unless one with the same
 *      relative path already exists;
 *   2. makes sure `guidelines` / `guideline_chunks` hold the same content.
 *
 * Per-file failures are logged and counted as skipped; the run continues.
 * A missing specialty folder, a failure to resolve the specialty record, or
 * any error outside the per-file handling ends the process with exit code 1.
 */
async function seedGuidelines(): Promise<void> {
  try {
    logger.info(`Starting medical knowledge seeding for ${SPECIALTY_LABEL}...`);
    logger.info(`⚠️  This will seed ALL diseases in the ${SPECIALTY_FOLDER} folder.`);

    validateConfig();

    // Get data folder path (relative to project root)
    const dataRoot = join(__dirname, '../../data');
    const specialtyRoot = join(dataRoot, SPECIALTY_FOLDER);

    logger.info(`Reading data from: ${specialtyRoot}`);

    // Check if specialty folder exists
    try {
      await stat(specialtyRoot);
    } catch {
      logger.error(`Specialty folder not found: ${specialtyRoot}`);
      process.exit(1);
    }

    // Get or create specialty record
    const specialtyId = await getOrCreateSpecialty(SPECIALTY_LABEL);
    if (!specialtyId) {
      logger.error('Failed to get or create specialty record');
      process.exit(1);
    }
    logger.info(`Specialty ID: ${specialtyId}`);

    // Read chapter directories
    const chapterEntries = await readdir(specialtyRoot);
    let totalSeeded = 0;
    let totalSkipped = 0;
    let totalDuplicates = 0;

    // Process ALL chapters (no filtering)
    for (const chapterSlug of chapterEntries) {
      const chapterPath = join(specialtyRoot, chapterSlug);

      if (!(await isDirectory(chapterPath))) {
        continue;
      }

      const chapterLabel = slugToLabel(chapterSlug);
      logger.info(`\n📖 Processing chapter: ${chapterLabel}`);

      // Read disease directories
      const diseaseEntries = await readdir(chapterPath);

      // Process ALL diseases in this chapter (no filtering)
      for (const diseaseSlug of diseaseEntries) {
        const diseasePath = join(chapterPath, diseaseSlug);

        if (!(await isDirectory(diseasePath))) {
          continue;
        }

        const diseaseLabel = slugToLabel(diseaseSlug);
        logger.info(`  🩺 Processing disease: ${diseaseLabel}`);

        // Resolved on the first usable section file and reused for the rest of
        // this disease; stays null (and is retried) while resolution fails.
        let diseaseId: string | null = null;

        // Read section files
        const sectionFiles = await readdir(diseasePath);

        for (const filename of sectionFiles) {
          // Skip non-txt files and _raw.txt
          if (!filename.endsWith('.txt') || filename === '_raw.txt') {
            continue;
          }

          const sectionInfo = parseSectionFileName(filename);
          if (!sectionInfo) {
            logger.warn(`    ⚠️  Skipping invalid filename: ${filename}`);
            totalSkipped++;
            continue;
          }

          const { title: sectionTitle } = sectionInfo;
          const sectionPath = join(diseasePath, filename);

          try {
            const content = await readFile(sectionPath, 'utf-8');
            const contentTrimmed = content.trim();

            if (!contentTrimmed) {
              logger.warn(`    ⚠️  Skipping empty file: ${filename}`);
              totalSkipped++;
              continue;
            }

            // Path relative to the data root, always '/'-separated so the
            // stored key is the same on every platform.
            const relativePath = [SPECIALTY_FOLDER, chapterSlug, diseaseSlug, filename].join('/');

            logger.info(`    📄 ${sectionTitle} (${contentTrimmed.length} chars)`);

            // Get or create disease record (once per disease)
            if (diseaseId === null) {
              diseaseId = await getOrCreateDisease(diseaseLabel, specialtyId);
            }

            // The embedding is requested only when an insert needs it, and at
            // most once per file.
            let cachedEmbedding: number[] | undefined;
            const getEmbedding = async (): Promise<number[]> => {
              if (!cachedEmbedding) {
                cachedEmbedding = await embed(contentTrimmed);
              }
              return cachedEmbedding;
            };

            // Check if this chunk already exists (by path) to avoid duplicates.
            // A failed lookup must not be read as "does not exist".
            const { data: existingMedicalChunk, error: lookupError } = await supabase
              .from('medical_knowledge_chunks')
              .select('id')
              .eq('path', relativePath)
              .limit(1)
              .maybeSingle();

            if (lookupError) {
              logger.error(
                { error: lookupError.message },
                `    ❌ Error checking for existing chunk: ${sectionTitle}`
              );
              totalSkipped++;
              continue;
            }

            const isNewMedicalChunk = !existingMedicalChunk;
            if (existingMedicalChunk) {
              logger.info(`    ⏭️  Skipping duplicate medical chunk: ${sectionTitle} (already exists)`);
              totalDuplicates++;
              // Still insert into guideline_chunks for RAG compatibility
            } else {
              // Get info domain ID (match section title with info domain)
              const infoDomainId = await getInfoDomainId(sectionTitle);
              const embedding = await getEmbedding();

              // 1. Insert into medical_knowledge_chunks (Structured Knowledge)
              const { error: error1 } = await supabase.from('medical_knowledge_chunks').insert({
                specialty_id: specialtyId,
                disease_id: diseaseId,
                info_domain_id: infoDomainId,
                specialty: SPECIALTY_LABEL,
                chapter: chapterLabel,
                disease: diseaseLabel,
                section_title: sectionTitle,
                content: contentTrimmed,
                path: relativePath,
                embedding,
              });

              if (error1) {
                logger.error({ error: error1.message }, `    ❌ Error inserting structured chunk: ${sectionTitle}`);
                totalSkipped++;
                continue; // Skip legacy insert if structured failed
              }
            }

            // 2. Insert into guidelines + guideline_chunks (RAG Compatibility)
            await ensureGuidelineChunk(
              { diseaseLabel, chapterLabel, sectionTitle, content: contentTrimmed },
              getEmbedding
            );

            // Count as seeded only if we inserted a new medical chunk
            if (isNewMedicalChunk) {
              totalSeeded++;
            }
          } catch (error) {
            logger.error({ error }, `    ❌ Error processing file: ${filename}`);
            totalSkipped++;
          }
        }
      }
    }

    logger.info(`\n✅ Seeding completed!`);
    logger.info(`   📊 Total seeded: ${totalSeeded}`);
    logger.info(`   ⏭️  Total duplicates (skipped): ${totalDuplicates}`);
    logger.info(`   ⚠️  Total skipped (errors): ${totalSkipped}`);
  } catch (error) {
    if (error instanceof Error) {
      logger.error({ error: error.message, stack: error.stack }, 'Seeding failed');
    } else {
      logger.error({ error: JSON.stringify(error) }, 'Seeding failed');
    }
    process.exit(1);
  }
}

// Script entry point; seedGuidelines handles its own errors and exit code.
void seedGuidelines();