| package bg.bas.dcl.LLMs.IfGPTDataset; |
|
|
| import java.io.File; |
| import java.io.FileOutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.PrintWriter; |
| import java.io.Writer; |
| import java.nio.file.Files; |
| import java.nio.file.StandardCopyOption; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Scanner; |
| import java.util.Set; |
| import java.util.TreeSet; |
|
|
| import info.debatty.java.lsh.MinHash; |
|
|
| import bg.bas.dcl.general.FileHandler; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public class DeduplicationProcessor { |
|
|
| |
| |
| |
|
|
| private final double threshold; |
| private final int shingleSize; |
| private final int numHashes; |
|
|
| |
| |
| |
|
|
| |
| private final Set<String> vocabulary = new HashSet<>(); |
|
|
| |
| |
| |
| |
| private final Map<SentenceKey, IndexedSentence> corpusIndex = new LinkedHashMap<>(); |
|
|
| |
| private MinHash minHash; |
|
|
| |
| |
| |
|
|
| |
| private final List<DuplicatePair> duplicatePairs = new ArrayList<>(); |
|
|
| |
| |
| |
| |
| private final Set<SentenceKey> duplicateNewSentences = new HashSet<>(); |
|
|
| |
| |
| |
|
|
| public DeduplicationProcessor(double threshold) { |
| this(threshold, 5, 200); |
| } |
|
|
| public DeduplicationProcessor(double threshold, int shingleSize, int numHashes) { |
| if (threshold < 0 || threshold > 1) |
| throw new IllegalArgumentException("Threshold must be in [0, 1]."); |
| this.threshold = threshold; |
| this.shingleSize = shingleSize; |
| this.numHashes = numHashes; |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| public void indexCorpus(String corpusDir) { |
| System.out.println("[Index] Scanning corpus: " + corpusDir); |
| try { |
| FileHandler fh = new FileHandler(); |
|
|
| |
| |
| Map<SentenceKey, Set<String>> rawShingles = new LinkedHashMap<>(); |
|
|
| for (File f : fh.getFileListing(new File(corpusDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
|
|
| Scanner sc = new Scanner(f, "UTF-8"); |
| int lineNum = 0; |
| while (sc.hasNextLine()) { |
| String line = sc.nextLine().trim(); |
| lineNum++; |
| if (line.length() < shingleSize) continue; |
|
|
| Set<String> shingles = shingle(line); |
| vocabulary.addAll(shingles); |
| rawShingles.put(new SentenceKey(f.getName(), lineNum), shingles); |
| } |
| sc.close(); |
| } |
|
|
| System.out.println("[Index] Vocabulary size: " + vocabulary.size() |
| + " Sentences: " + rawShingles.size()); |
|
|
| if (vocabulary.isEmpty()) { |
| System.err.println("[Index] No sentences found — aborting."); |
| return; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| minHash = new MinHash(numHashes, vocabulary.size()); |
|
|
| |
| List<String> vocabList = new ArrayList<>(vocabulary); |
| corpusIndex.clear(); |
|
|
| |
| Map<SentenceKey, String> rawTexts = new HashMap<>(); |
| |
| for (File f : fh.getFileListing(new File(corpusDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
| Scanner sc = new Scanner(f, "UTF-8"); |
| int lineNum = 0; |
| while (sc.hasNextLine()) { |
| String line = sc.nextLine().trim(); |
| lineNum++; |
| if (line.length() < shingleSize) continue; |
| rawTexts.put(new SentenceKey(f.getName(), lineNum), line); |
| } |
| sc.close(); |
| } |
|
|
| for (Map.Entry<SentenceKey, Set<String>> entry : rawShingles.entrySet()) { |
| SentenceKey key = entry.getKey(); |
| boolean[] vector = toVector(entry.getValue(), vocabList); |
| int[] sig = minHash.signature(vector); |
| String rawText = rawTexts.getOrDefault(key, ""); |
| corpusIndex.put(key, new IndexedSentence(rawText, sig)); |
| } |
|
|
| System.out.println("[Index] Corpus index built: " |
| + corpusIndex.size() + " sentences."); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public void detectDuplicates(String newDir, String reportPath) { |
| if (corpusIndex.isEmpty()) { |
| System.err.println("[Detect] Corpus index is empty. Call indexCorpus() first."); |
| return; |
| } |
|
|
| System.out.println("[Detect] Comparing new folder against corpus index..."); |
| duplicatePairs.clear(); |
| duplicateNewSentences.clear(); |
|
|
| List<String> vocabList = new ArrayList<>(vocabulary); |
|
|
| try { |
| FileHandler fh = new FileHandler(); |
|
|
| for (File f : fh.getFileListing(new File(newDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
|
|
| System.out.println("[Detect] Checking: " + f.getName()); |
|
|
| Scanner sc = new Scanner(f, "UTF-8"); |
| int lineNum = 0; |
|
|
| while (sc.hasNextLine()) { |
| String line = sc.nextLine().trim(); |
| lineNum++; |
| if (line.length() < shingleSize) continue; |
|
|
| Set<String> shingles = shingle(line); |
|
|
| |
| Set<String> filtered = new HashSet<>(shingles); |
| filtered.retainAll(vocabulary); |
|
|
| |
| |
| if (filtered.isEmpty()) continue; |
|
|
| boolean[] newVec = toVector(filtered, vocabList); |
| int[] newSig = minHash.signature(newVec); |
|
|
| SentenceKey newKey = new SentenceKey(f.getName(), lineNum); |
|
|
| |
| |
| for (Map.Entry<SentenceKey, IndexedSentence> entry : corpusIndex.entrySet()) { |
| double sim = minHash.similarity(newSig, entry.getValue().signature); |
| if (sim >= threshold) { |
| DuplicatePair pair = new DuplicatePair( |
| newKey, line, |
| entry.getKey(), entry.getValue().text, |
| sim); |
| duplicatePairs.add(pair); |
| duplicateNewSentences.add(newKey); |
| |
| } |
| } |
| } |
| sc.close(); |
| } |
|
|
| System.out.println("[Detect] Duplicate sentence pairs found: " |
| + duplicatePairs.size()); |
| System.out.println("[Detect] Unique new sentences flagged: " |
| + duplicateNewSentences.size()); |
|
|
| writeReport(reportPath); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public void removeDuplicatesFromNewFolder(String newDir, boolean keepBackup) { |
| if (duplicateNewSentences.isEmpty()) { |
| System.out.println("[Remove] No duplicates to remove."); |
| return; |
| } |
|
|
| System.out.println("[Remove] Removing " |
| + duplicateNewSentences.size() + " duplicate sentences..."); |
|
|
| try { |
| FileHandler fh = new FileHandler(); |
| int filesModified = 0; |
| int totalRemoved = 0; |
|
|
| for (File f : fh.getFileListing(new File(newDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
|
|
| List<String> inputLines = new ArrayList<>(); |
| Scanner sc = new Scanner(f, "UTF-8"); |
| int lineNum = 0; |
| while (sc.hasNextLine()) { |
| inputLines.add(sc.nextLine()); |
| lineNum++; |
| } |
| sc.close(); |
|
|
| List<String> outputLines = new ArrayList<>(); |
| int removed = 0; |
|
|
| for (int i = 0; i < inputLines.size(); i++) { |
| String trimmed = inputLines.get(i).trim(); |
| |
| SentenceKey key = new SentenceKey(f.getName(), i + 1); |
|
|
| if (trimmed.length() >= shingleSize |
| && duplicateNewSentences.contains(key)) { |
| removed++; |
| } else { |
| outputLines.add(inputLines.get(i)); |
| } |
| } |
|
|
| if (removed > 0) { |
| if (keepBackup) { |
| Files.copy(f.toPath(), |
| new File(f.getAbsolutePath() + ".bak").toPath(), |
| StandardCopyOption.REPLACE_EXISTING); |
| } |
|
|
| |
| boolean allBlank = outputLines.stream() |
| .allMatch(String::isBlank); |
|
|
| if (allBlank) { |
| f.delete(); |
| System.out.println("[Remove] Deleted (empty after dedup): " |
| + f.getName()); |
| } else { |
| Writer w = new OutputStreamWriter( |
| new FileOutputStream(f), "UTF-8"); |
| for (String l : outputLines) { |
| w.write(l + "\n"); |
| } |
| w.flush(); |
| w.close(); |
| System.out.println("[Remove] " + f.getName() |
| + " — removed " + removed + " sentences."); |
| } |
|
|
| filesModified++; |
| totalRemoved += removed; |
| } |
| } |
|
|
| System.out.println("[Remove] Done. Files modified: " + filesModified |
| + " Sentences removed: " + totalRemoved); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| private void writeReport(String reportPath) throws Exception { |
| try (PrintWriter pw = new PrintWriter( |
| new OutputStreamWriter(new FileOutputStream(reportPath), "UTF-8"))) { |
|
|
| |
| pw.println("# DeduplicationProcessor report"); |
| pw.println("# Threshold: " + threshold |
| + " ShingleSize: " + shingleSize |
| + " NumHashes: " + numHashes); |
| pw.println("# Duplicate pairs: " + duplicatePairs.size()); |
| pw.println("# Unique new sentences flagged: " + duplicateNewSentences.size()); |
| pw.println(); |
| pw.println("NEW_FILE\tNEW_LINE\tCORPUS_FILE\tCORPUS_LINE\tSIMILARITY\tNEW_SENTENCE\tCORPUS_SENTENCE"); |
|
|
| |
| List<DuplicatePair> sorted = new ArrayList<>(duplicatePairs); |
| sorted.sort((a, b) -> { |
| int cmp = Double.compare(b.similarity, a.similarity); |
| if (cmp != 0) return cmp; |
| cmp = a.newKey.fileName.compareTo(b.newKey.fileName); |
| if (cmp != 0) return cmp; |
| return Integer.compare(a.newKey.lineNumber, b.newKey.lineNumber); |
| }); |
|
|
| for (DuplicatePair p : sorted) { |
| pw.printf("%s\t%d\t%s\t%d\t%.4f\t%s\t%s%n", |
| p.newKey.fileName, |
| p.newKey.lineNumber, |
| p.corpusKey.fileName, |
| p.corpusKey.lineNumber, |
| p.similarity, |
| sanitiseTsv(p.newText), |
| sanitiseTsv(p.corpusText)); |
| } |
| } |
| System.out.println("[Report] Written to: " + reportPath); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| private Set<String> shingle(String text) { |
| Set<String> shingles = new TreeSet<>(); |
| String lower = text.toLowerCase(); |
| for (int i = 0; i <= lower.length() - shingleSize; i++) { |
| shingles.add(lower.substring(i, i + shingleSize)); |
| } |
| return shingles; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| private boolean[] toVector(Set<String> shingles, List<String> vocabList) { |
| boolean[] vector = new boolean[vocabList.size()]; |
| for (int i = 0; i < vocabList.size(); i++) { |
| vector[i] = shingles.contains(vocabList.get(i)); |
| } |
| return vector; |
| } |
|
|
| |
| |
| |
|
|
| private String sanitiseTsv(String s) { |
| if (s == null) return ""; |
| return s.replace("\t", " ").replace("\n", " ").replace("\r", ""); |
| } |
|
|
| |
| public List<DuplicatePair> getDuplicatePairs() { |
| return Collections.unmodifiableList(duplicatePairs); |
| } |
|
|
| |
| public int getCorpusSize() { |
| return corpusIndex.size(); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| public static class SentenceKey { |
| public final String fileName; |
| public final int lineNumber; |
|
|
| public SentenceKey(String fileName, int lineNumber) { |
| this.fileName = fileName; |
| this.lineNumber = lineNumber; |
| } |
|
|
| @Override |
| public boolean equals(Object o) { |
| if (!(o instanceof SentenceKey)) return false; |
| SentenceKey other = (SentenceKey) o; |
| return lineNumber == other.lineNumber |
| && fileName.equals(other.fileName); |
| } |
|
|
| @Override |
| public int hashCode() { |
| return 31 * fileName.hashCode() + lineNumber; |
| } |
|
|
| @Override |
| public String toString() { |
| return fileName + ":" + lineNumber; |
| } |
| } |
|
|
| |
| |
| |
| private static class IndexedSentence { |
| final String text; |
| final int[] signature; |
|
|
| IndexedSentence(String text, int[] signature) { |
| this.text = text; |
| this.signature = signature; |
| } |
| } |
|
|
| |
| |
| |
| |
| public static class DuplicatePair { |
| public final SentenceKey newKey; |
| public final String newText; |
| public final SentenceKey corpusKey; |
| public final String corpusText; |
| public final double similarity; |
|
|
| public DuplicatePair(SentenceKey newKey, String newText, |
| SentenceKey corpusKey, String corpusText, |
| double similarity) { |
| this.newKey = newKey; |
| this.newText = newText; |
| this.corpusKey = corpusKey; |
| this.corpusText = corpusText; |
| this.similarity = similarity; |
| } |
|
|
| @Override |
| public String toString() { |
| return String.format("[%.2f] %s ↔ %s", similarity, newKey, corpusKey); |
| } |
| } |
| } |
|
|