File size: 3,527 Bytes
34844c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
WITH dedup_master_files AS (
  SELECT *
  FROM (
    SELECT
      *,
      LOWER(
        REGEXP_REPLACE(original_filename, r'^(Copy of\s*)+', '')
      ) AS cleaned_filename,
      ROW_NUMBER() OVER (
        PARTITION BY original_filename
        ORDER BY created_at DESC
      ) AS rn
    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.master_files`
  )
  WHERE rn = 1
),
dedup_page1_cover AS (
  SELECT *
  FROM (
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY file_id
        ORDER BY created_at DESC
      ) AS rn
    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page1_cover`
  )
  WHERE rn = 1
),

dedup_page2_identitas AS (
  SELECT *
  FROM (
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY file_id
        ORDER BY inserted_at DESC
      ) AS rn
    FROM `stellar-orb-451904-d9.raw_data_ocr_kia.page2_identitas`
  )
  WHERE rn = 1
),

label AS (
  SELECT
    filename AS filename_label,
    project_id_mother AS project_id_mother_label,
    CASE 
      WHEN first_name IS NOT NULL 
          AND last_name IS NOT NULL 
          AND first_name = last_name 
        THEN first_name
      ELSE CONCAT(
            IFNULL(first_name, ''), 
            IFNULL(
              CONCAT(' ', last_name), 
              ''
            )
          )
    END AS nama_ibu_cover_label,
    puskesmas_name AS dikeluarkan_oleh_fasilitas_kesehatan_label,
    regency_name AS kabupaten_kota_label,
    CASE 
    WHEN first_name IS NOT NULL 
        AND last_name IS NOT NULL 
        AND first_name = last_name 
      THEN first_name
    ELSE CONCAT(
          IFNULL(first_name, ''), 
          IFNULL(
            CONCAT(' ', last_name), 
            ''
          )
        )
    END AS nama_ibu,
    nik_mother AS nik_ibu_label,
    birth_date AS tempat_tanggal_lahir_ibu_label,
    address_street AS alamat_rumah_ibu_label,
    contact_number AS telepon_ibu_label,
    bpjs_mother AS no_jkn_ibu_label,
    education_level AS pendidikan_ibu_label,
    occupation AS pekerjaan_ibu_label,
    blood_type_result AS golongan_darah_ibu_label,
    age AS usia_ibu_label,
    pregnancy_number AS kehamilan_ke_label,
    number_live_birth AS jumlah_anak_lahir_hidup_label,
    number_birth_lost AS riwayat_keguguran_label,
    previous_preg_issue AS riwayat_penyakit_ibu_label
  FROM (
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY `filename`
        ORDER BY `end` DESC
      ) AS rn
    FROM `stellar-orb-451904-d9.ocr_bukukia_ocr_sid.List Correct Entry 2025`
  )
  WHERE
    rn = 1
)
,
main AS (
SELECT
  MF.id,
  MF.cleaned_filename AS `filename`,
  REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS project_id_mother,
  P1.nama_ibu_cover,
  P1.dikeluarkan_oleh_fasilitas_kesehatan,
  P1.kabupaten_kota,
  P2.nama_ibu,
  P2.nik_ibu,
  P2.tempat_tanggal_lahir_ibu,
  P2.alamat_rumah_ibu,
  P2.telepon_ibu,
  P2.no_jkn_ibu,
  P2.pendidikan_ibu,
  P2.pekerjaan_ibu,
  P2.golongan_darah_ibu,
  P2.usia_ibu,
  P2.kehamilan_ke,
  P2.jumlah_anak_lahir_hidup,
  P2.riwayat_keguguran,
  P2.riwayat_penyakit_ibu,
  L.*,
  ROW_NUMBER() OVER (
    PARTITION BY MF.`cleaned_filename`
    ORDER BY MF.`cleaned_filename` DESC
  ) AS rn
FROM dedup_master_files MF
LEFT JOIN dedup_page1_cover P1 ON MF.id = P1.id
LEFT JOIN dedup_page2_identitas P2 ON MF.id = P2.id
LEFT JOIN label L ON CAST(REGEXP_EXTRACT(MF.original_filename, r'(7000\d{4,})') AS INTEGER) = L.project_id_mother_label
)

SELECT *
FROM main
WHERE 
  rn = 1 and 
  id != "9a21ef00-6b02-4b04-81c3-68e25e2c8b7f" --duplicated