| package bg.bas.dcl.LLMs.IfGPTDataset; |
|
|
| import java.io.FileWriter; |
| import java.io.PrintWriter; |
| import java.util.ArrayList; |
| import java.util.LinkedHashSet; |
|
|
| import org.json.simple.JSONArray; |
| import org.json.simple.JSONObject; |
|
|
| import bg.bas.dcl.general.JSONProcessor; |
|
|
| import java.io.File; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public abstract class BaseSourceProcessor implements SourceProcessor { |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| public void convertJsonToCSV(String metadataJsonPath) { |
| try { |
| JSONProcessor pr = new JSONProcessor(); |
| JSONObject json = pr.readJSON(new File(metadataJsonPath)); |
| convertJsonToCSV(json, metadataJsonPath + "_CSV.csv"); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| public void convertJsonToCSV(JSONObject json, String outCsvPath) { |
| try { |
| JSONArray array = (JSONArray) json.get("metadata"); |
|
|
| if (array == null || array.isEmpty()) { |
| System.err.println("[INCONSISTENCY] 'metadata' array is null or empty in: " + outCsvPath); |
| return; |
| } |
|
|
| |
| LinkedHashSet<String> headersSet = new LinkedHashSet<>(); |
| for (Object obj : array) { |
| if (obj instanceof JSONObject) { |
| headersSet.addAll(((JSONObject) obj).keySet()); |
| } else { |
| System.err.println("[INCONSISTENCY] Non-JSONObject entry found in metadata array."); |
| } |
| } |
|
|
| ArrayList<String> headers = new ArrayList<>(headersSet); |
|
|
| try (PrintWriter writer = new PrintWriter(new FileWriter(outCsvPath))) { |
|
|
| |
| writer.println(String.join(",", headers)); |
|
|
| |
| for (int i = 0; i < array.size(); i++) { |
| Object obj = array.get(i); |
|
|
| if (!(obj instanceof JSONObject)) { |
| System.err.println("[INCONSISTENCY] Row " + i + " is not a JSONObject, skipping."); |
| continue; |
| } |
|
|
| JSONObject row = (JSONObject) obj; |
|
|
| |
| for (String header : headers) { |
| if (!row.containsKey(header)) { |
| System.err.println("[INCONSISTENCY] Row " + i + " missing field: '" + header + "'"); |
| } |
| } |
| for (Object key : row.keySet()) { |
| if (!headersSet.contains(key.toString())) { |
| System.err.println("[INCONSISTENCY] Row " + i + " has unexpected field: '" + key + "'"); |
| } |
| } |
|
|
| |
| ArrayList<String> values = new ArrayList<>(); |
| for (String header : headers) { |
| Object value = row.get(header); |
| if (value == null) { |
| values.add(""); |
| } else { |
| String strVal = value.toString(); |
| if (strVal.contains(",") || strVal.contains("\"") || strVal.contains("\n")) { |
| strVal = "\"" + strVal.replace("\"", "\"\"") + "\""; |
| } |
| values.add(strVal); |
| } |
| } |
| writer.println(String.join(",", values)); |
| } |
| } |
|
|
| System.out.println("CSV written to: " + outCsvPath); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| protected int estimateTokenCount(String sentence) { |
| String[] words = sentence.split(" "); |
| int punctCount = sentence.length() |
| - sentence.replaceAll("[.,;:()?!\\-]", "").length(); |
| return words.length + punctCount; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| @SuppressWarnings("unchecked") |
| protected JSONObject newBaseDescriptor(String identifier) { |
| JSONObject fdescr = new JSONObject(); |
| fdescr.put("Identifier", identifier); |
| fdescr.put("Licence", ""); |
| fdescr.put("LicenceLink", ""); |
| fdescr.put("PublicationDate", ""); |
| fdescr.put("DocumentTitle", ""); |
| fdescr.put("Source", ""); |
| fdescr.put("Author", ""); |
| fdescr.put("Style", ""); |
| fdescr.put("Type", ""); |
| fdescr.put("Subdomain", ""); |
| fdescr.put("TranslatedDocument", ""); |
| fdescr.put("CollectionDate", ""); |
| fdescr.put("Medium", "text"); |
| fdescr.put("Url", ""); |
| fdescr.put("Domain", ""); |
| fdescr.put("Keywords", ""); |
| fdescr.put("PersonallyIdentifiableInformation", ""); |
| fdescr.put("BiasedInformation", ""); |
| fdescr.put("TaskCategories", ""); |
| fdescr.put("NumberWords", 0); |
| fdescr.put("NumberSentences", 0); |
| fdescr.put("NumberParagraphs", 0); |
| fdescr.put("NumberTokens", 0); |
| return fdescr; |
| } |
| } |
|
|