package bg.bas.dcl.LLMs.IfGPTDataset; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.Scanner; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import bg.bas.dcl.general.FileHandler; /** * Processes the CURLICAT Bulgarian corpus. * * Input: CoNLL-UP files (.conllup) with richer inline metadata than MARCELL. * Output: One plain-text .txt per document + metadata.json + metadata CSV. * * Metadata comment prefixes recognised: * # PublicationDate = → PublicationDate * # DocumentTitle = → DocumentTitle * # Author = → Author * # DocumentType = → Type * # Url = → Url * # Style = → Style * # Domain = → Domain * # Subdomain = → Subdomain * # CollectionDate = → CollectionDate * # License = → Licence (overrides default if present) * * Default licence: CC-BY-SA-4.0. */ public class CurlicatProcessor extends BaseSourceProcessor { private static final String DEFAULT_LICENCE = "CC-BY-SA-4.0"; private static final String DEFAULT_LICENCE_LINK = "https://elrc-share.eu/static/metashare/licences/CC-BY-SA-4.0.pdf"; private static final String PREFIX = "bg_CURLICAT_"; private static final String EXT = ".conllup"; @Override public void process(String indir, String outdir) { try { FileHandler fh = new FileHandler(); JSONObject json = new JSONObject(); JSONArray descrArray = new JSONArray(); for (File f : fh.getFileListing(new File(indir))) { if (!f.isFile()) continue; System.out.println("Processing: " + f.getAbsolutePath()); String tfname = PREFIX + f.getName().replace(EXT, ""); JSONObject fdescr = newBaseDescriptor(tfname); fdescr.put("Licence", DEFAULT_LICENCE); fdescr.put("LicenceLink", DEFAULT_LICENCE_LINK); Writer out = new OutputStreamWriter( new FileOutputStream(outdir + tfname + ".txt"), "UTF-8"); Scanner s = new Scanner(f, "UTF-8"); int nw = 0, ns = 0, np = 0, nt = 0; while (s.hasNextLine()) { String line = s.nextLine(); // --- Metadata extraction --- if (line.startsWith("# PublicationDate =")) { fdescr.put("PublicationDate", line.replace("# PublicationDate =", "").trim()); } else if (line.startsWith("# DocumentTitle =")) { fdescr.put("DocumentTitle", line.replace("# DocumentTitle =", "").trim()); } else if (line.startsWith("# Author =")) { fdescr.put("Author", line.replace("# Author =", "").trim()); } else if (line.startsWith("# DocumentType =")) { fdescr.put("Type", line.replace("# DocumentType =", "").trim()); } else if (line.startsWith("# Url =")) { fdescr.put("Url", line.replace("# Url =", "").trim()); } else if (line.startsWith("# Style =")) { fdescr.put("Style", line.replace("# Style =", "").trim()); } else if (line.startsWith("# Domain =")) { fdescr.put("Domain", line.replace("# Domain =", "").trim()); } else if (line.startsWith("# Subdomain =")) { fdescr.put("Subdomain", line.replace("# Subdomain =", "").trim()); } else if (line.startsWith("# CollectionDate =")) { fdescr.put("CollectionDate", line.replace("# CollectionDate =", "").trim()); } else if (line.startsWith("# License =")) { // Override default licence if the file declares one fdescr.put("Licence", line.replace("# License =", "").trim()); } // --- Structure counting --- else if (line.startsWith("# sent_id =")) { ns++; } else if (line.startsWith("# newpar id =")) { np++; out.write("\n"); } // --- Text output --- else if (line.startsWith("# text =")) { out.write(line.replace("# text =", "").trim() + "\n"); out.flush(); } else { // CoNLL-UP token line String[] cols = line.split("\t"); if (cols.length > 5) { nt++; if (!cols[3].equals("PUNCT")) nw++; } } } s.close(); out.flush(); out.close(); fdescr.put("NumberWords", nw); fdescr.put("NumberSentences", ns); fdescr.put("NumberParagraphs", np); fdescr.put("NumberTokens", nt); descrArray.add(fdescr); } json.put("metadata", descrArray); writeMetadata(json, outdir, "metadata_CC.json"); } catch (Exception e) { e.printStackTrace(); } } // ----------------------------------------------------------------------- @SuppressWarnings("unchecked") private void writeMetadata(JSONObject json, String outdir, String filename) throws Exception { String outMetaPath = outdir + filename; Writer outMeta = new OutputStreamWriter( new FileOutputStream(outMetaPath), "UTF-8"); json.writeJSONString(outMeta); outMeta.flush(); outMeta.close(); convertJsonToCSV(json, outMetaPath + "_CSV.csv"); System.out.println("Metadata written to: " + outMetaPath); } }