package bg.bas.dcl.LLMs.IfGPTDataset; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.Writer; import java.nio.file.Files; import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import java.util.TreeSet; import info.debatty.java.lsh.MinHash; import bg.bas.dcl.general.FileHandler; /** * DeduplicationProcessor — sentence-level near-duplicate detection * using MinHash + LSH (Jaccard similarity). * * ----------------------------------------------------------------------- * MAVEN DEPENDENCY (add to pom.xml): * * * info.debatty * java-lsh * 0.12 * * * ----------------------------------------------------------------------- * HOW IT WORKS * * 1. INDEX phase — reads all .txt files in the "full corpus" directory. * Each sentence is shingled into character n-grams, converted to a * boolean vector over a shared vocabulary, and a MinHash signature * is computed. All signatures are stored in an in-memory index keyed * by (file, lineNumber). * * 2. QUERY phase — reads every sentence in the "new folder". * For each sentence its MinHash signature is compared against every * indexed corpus signature (approximate Jaccard via signature similarity). * Pairs whose estimated Jaccard similarity ≥ threshold are reported. * * 3. REPORT — a TSV report is written listing every duplicate pair: * new-file | new-line | corpus-file | corpus-line | similarity | sentence * * 4. OPTIONAL REMOVE — sentences in the new folder that are duplicates of * corpus sentences are stripped from their file (originals backed up). * Files that become empty after removal are deleted. * * ----------------------------------------------------------------------- * PARAMETERS * * threshold — Jaccard similarity to call a near-duplicate (default 0.90) * shingleSize — character n-gram size for shingling (default 5) * numHashes — number of hash functions for MinHash (default 200) * More hashes → better accuracy, slower index. * * ----------------------------------------------------------------------- * USAGE * * DeduplicationProcessor dp = new DeduplicationProcessor(0.90); * dp.indexCorpus("/path/to/full/corpus/"); * dp.detectDuplicates("/path/to/new/folder/", "/path/to/report.tsv"); * dp.removeDuplicatesFromNewFolder("/path/to/new/folder/", true); // true=keep .bak */ public class DeduplicationProcessor { // ----------------------------------------------------------------------- // Configuration // ----------------------------------------------------------------------- private final double threshold; // Jaccard similarity cut-off private final int shingleSize; // character n-gram size private final int numHashes; // MinHash signature length // ----------------------------------------------------------------------- // Index state (built during indexCorpus) // ----------------------------------------------------------------------- /** Shared vocabulary: every distinct shingle seen across all corpus sentences. */ private final Set vocabulary = new HashSet<>(); /** * Corpus index: maps SentenceKey → raw sentence text + MinHash signature. * Built in two passes to allow vocabulary to be finalised before signing. */ private final Map corpusIndex = new LinkedHashMap<>(); /** MinHash object — initialised once vocabulary size is known. */ private MinHash minHash; // ----------------------------------------------------------------------- // Duplicate results (populated by detectDuplicates) // ----------------------------------------------------------------------- /** All duplicate pairs found in the last detectDuplicates run. */ private final List duplicatePairs = new ArrayList<>(); /** * Set of SentenceKeys in the NEW folder that are duplicates. * Used by removeDuplicatesFromNewFolder. */ private final Set duplicateNewSentences = new HashSet<>(); // ----------------------------------------------------------------------- // Constructor // ----------------------------------------------------------------------- public DeduplicationProcessor(double threshold) { this(threshold, 5, 200); } public DeduplicationProcessor(double threshold, int shingleSize, int numHashes) { if (threshold < 0 || threshold > 1) throw new IllegalArgumentException("Threshold must be in [0, 1]."); this.threshold = threshold; this.shingleSize = shingleSize; this.numHashes = numHashes; } // ----------------------------------------------------------------------- // Phase 1 — Index the full corpus // ----------------------------------------------------------------------- /** * Reads all .txt files in {@code corpusDir}, shingles every sentence, * builds a shared vocabulary, and computes MinHash signatures. * * This must be called before {@link #detectDuplicates}. * * @param corpusDir directory of clean .txt files representing the full corpus */ public void indexCorpus(String corpusDir) { System.out.println("[Index] Scanning corpus: " + corpusDir); try { FileHandler fh = new FileHandler(); // --- Pass 1: collect sentences and build vocabulary --- // Temporary store: key → raw text + shingle set (signatures computed later) Map> rawShingles = new LinkedHashMap<>(); for (File f : fh.getFileListing(new File(corpusDir))) { if (!f.isFile() || !f.getName().endsWith(".txt")) continue; Scanner sc = new Scanner(f, "UTF-8"); int lineNum = 0; while (sc.hasNextLine()) { String line = sc.nextLine().trim(); lineNum++; if (line.length() < shingleSize) continue; Set shingles = shingle(line); vocabulary.addAll(shingles); rawShingles.put(new SentenceKey(f.getName(), lineNum), shingles); } sc.close(); } System.out.println("[Index] Vocabulary size: " + vocabulary.size() + " Sentences: " + rawShingles.size()); if (vocabulary.isEmpty()) { System.err.println("[Index] No sentences found — aborting."); return; } // --- Initialise MinHash with finalised vocabulary size --- // Error parameter 0.05 → ~400 hashes needed; we use numHashes directly. // The debatty MinHash constructor accepts (error, dictSize). // We use the lower-level approach: fix numHashes via the signature size. // info.debatty MinHash(double error, int dictSize) chooses hash count itself. // For explicit control we pass a small error so it aligns with numHashes. minHash = new MinHash(numHashes, vocabulary.size()); // --- Pass 2: compute and store signatures --- List vocabList = new ArrayList<>(vocabulary); corpusIndex.clear(); // Also keep a raw-text map for the report Map rawTexts = new HashMap<>(); // re-scan to get raw text (we only stored shingles above) for (File f : fh.getFileListing(new File(corpusDir))) { if (!f.isFile() || !f.getName().endsWith(".txt")) continue; Scanner sc = new Scanner(f, "UTF-8"); int lineNum = 0; while (sc.hasNextLine()) { String line = sc.nextLine().trim(); lineNum++; if (line.length() < shingleSize) continue; rawTexts.put(new SentenceKey(f.getName(), lineNum), line); } sc.close(); } for (Map.Entry> entry : rawShingles.entrySet()) { SentenceKey key = entry.getKey(); boolean[] vector = toVector(entry.getValue(), vocabList); int[] sig = minHash.signature(vector); String rawText = rawTexts.getOrDefault(key, ""); corpusIndex.put(key, new IndexedSentence(rawText, sig)); } System.out.println("[Index] Corpus index built: " + corpusIndex.size() + " sentences."); } catch (Exception e) { e.printStackTrace(); } } // ----------------------------------------------------------------------- // Phase 2 — Detect duplicates in new folder // ----------------------------------------------------------------------- /** * Compares every sentence in {@code newDir} against the corpus index. * Pairs with estimated Jaccard ≥ threshold are recorded as duplicates * and written to {@code reportPath}. * * Call {@link #indexCorpus} first. * * @param newDir directory of new .txt files to check * @param reportPath destination TSV report file */ public void detectDuplicates(String newDir, String reportPath) { if (corpusIndex.isEmpty()) { System.err.println("[Detect] Corpus index is empty. Call indexCorpus() first."); return; } System.out.println("[Detect] Comparing new folder against corpus index..."); duplicatePairs.clear(); duplicateNewSentences.clear(); List vocabList = new ArrayList<>(vocabulary); try { FileHandler fh = new FileHandler(); for (File f : fh.getFileListing(new File(newDir))) { if (!f.isFile() || !f.getName().endsWith(".txt")) continue; System.out.println("[Detect] Checking: " + f.getName()); Scanner sc = new Scanner(f, "UTF-8"); int lineNum = 0; while (sc.hasNextLine()) { String line = sc.nextLine().trim(); lineNum++; if (line.length() < shingleSize) continue; Set shingles = shingle(line); // Only shingles already in vocabulary are meaningful Set filtered = new HashSet<>(shingles); filtered.retainAll(vocabulary); // If almost none of the shingles are in vocab → skip // (the sentence is likely from a very different domain) if (filtered.isEmpty()) continue; boolean[] newVec = toVector(filtered, vocabList); int[] newSig = minHash.signature(newVec); SentenceKey newKey = new SentenceKey(f.getName(), lineNum); // Compare against all corpus sentences // For large corpora, replace this loop with an LSH band index for (Map.Entry entry : corpusIndex.entrySet()) { double sim = minHash.similarity(newSig, entry.getValue().signature); if (sim >= threshold) { DuplicatePair pair = new DuplicatePair( newKey, line, entry.getKey(), entry.getValue().text, sim); duplicatePairs.add(pair); duplicateNewSentences.add(newKey); // Don't break — report ALL corpus matches for transparency } } } sc.close(); } System.out.println("[Detect] Duplicate sentence pairs found: " + duplicatePairs.size()); System.out.println("[Detect] Unique new sentences flagged: " + duplicateNewSentences.size()); writeReport(reportPath); } catch (Exception e) { e.printStackTrace(); } } // ----------------------------------------------------------------------- // Phase 3 — Optionally remove duplicates from new folder // ----------------------------------------------------------------------- /** * Removes from every file in {@code newDir} any sentence whose * (file, lineNumber) is in the duplicate set detected by * {@link #detectDuplicates}. * * Files that become empty after removal are deleted. * Must be called after {@link #detectDuplicates}. * * @param newDir directory of new .txt files to clean * @param keepBackup if true, originals are renamed to *.bak first */ public void removeDuplicatesFromNewFolder(String newDir, boolean keepBackup) { if (duplicateNewSentences.isEmpty()) { System.out.println("[Remove] No duplicates to remove."); return; } System.out.println("[Remove] Removing " + duplicateNewSentences.size() + " duplicate sentences..."); try { FileHandler fh = new FileHandler(); int filesModified = 0; int totalRemoved = 0; for (File f : fh.getFileListing(new File(newDir))) { if (!f.isFile() || !f.getName().endsWith(".txt")) continue; List inputLines = new ArrayList<>(); Scanner sc = new Scanner(f, "UTF-8"); int lineNum = 0; while (sc.hasNextLine()) { inputLines.add(sc.nextLine()); lineNum++; } sc.close(); List outputLines = new ArrayList<>(); int removed = 0; for (int i = 0; i < inputLines.size(); i++) { String trimmed = inputLines.get(i).trim(); // +1 because lineNum was 1-based during indexing SentenceKey key = new SentenceKey(f.getName(), i + 1); if (trimmed.length() >= shingleSize && duplicateNewSentences.contains(key)) { removed++; } else { outputLines.add(inputLines.get(i)); } } if (removed > 0) { if (keepBackup) { Files.copy(f.toPath(), new File(f.getAbsolutePath() + ".bak").toPath(), StandardCopyOption.REPLACE_EXISTING); } // Check if file would become empty (only blank lines) boolean allBlank = outputLines.stream() .allMatch(String::isBlank); if (allBlank) { f.delete(); System.out.println("[Remove] Deleted (empty after dedup): " + f.getName()); } else { Writer w = new OutputStreamWriter( new FileOutputStream(f), "UTF-8"); for (String l : outputLines) { w.write(l + "\n"); } w.flush(); w.close(); System.out.println("[Remove] " + f.getName() + " — removed " + removed + " sentences."); } filesModified++; totalRemoved += removed; } } System.out.println("[Remove] Done. Files modified: " + filesModified + " Sentences removed: " + totalRemoved); } catch (Exception e) { e.printStackTrace(); } } // ----------------------------------------------------------------------- // Report writer // ----------------------------------------------------------------------- private void writeReport(String reportPath) throws Exception { try (PrintWriter pw = new PrintWriter( new OutputStreamWriter(new FileOutputStream(reportPath), "UTF-8"))) { // Header pw.println("# DeduplicationProcessor report"); pw.println("# Threshold: " + threshold + " ShingleSize: " + shingleSize + " NumHashes: " + numHashes); pw.println("# Duplicate pairs: " + duplicatePairs.size()); pw.println("# Unique new sentences flagged: " + duplicateNewSentences.size()); pw.println(); pw.println("NEW_FILE\tNEW_LINE\tCORPUS_FILE\tCORPUS_LINE\tSIMILARITY\tNEW_SENTENCE\tCORPUS_SENTENCE"); // Sort by similarity descending, then new file, then line List sorted = new ArrayList<>(duplicatePairs); sorted.sort((a, b) -> { int cmp = Double.compare(b.similarity, a.similarity); if (cmp != 0) return cmp; cmp = a.newKey.fileName.compareTo(b.newKey.fileName); if (cmp != 0) return cmp; return Integer.compare(a.newKey.lineNumber, b.newKey.lineNumber); }); for (DuplicatePair p : sorted) { pw.printf("%s\t%d\t%s\t%d\t%.4f\t%s\t%s%n", p.newKey.fileName, p.newKey.lineNumber, p.corpusKey.fileName, p.corpusKey.lineNumber, p.similarity, sanitiseTsv(p.newText), sanitiseTsv(p.corpusText)); } } System.out.println("[Report] Written to: " + reportPath); } // ----------------------------------------------------------------------- // Shingling and vectorisation helpers // ----------------------------------------------------------------------- /** * Produces the set of character n-grams (shingles) for a sentence. * Lowercased so matching is case-insensitive. */ private Set shingle(String text) { Set shingles = new TreeSet<>(); String lower = text.toLowerCase(); for (int i = 0; i <= lower.length() - shingleSize; i++) { shingles.add(lower.substring(i, i + shingleSize)); } return shingles; } /** * Converts a shingle set to a boolean presence vector over the shared vocabulary. * * @param shingles shingle set for this sentence * @param vocabList ordered list of all vocabulary shingles * @return boolean[] where true = shingle present */ private boolean[] toVector(Set shingles, List vocabList) { boolean[] vector = new boolean[vocabList.size()]; for (int i = 0; i < vocabList.size(); i++) { vector[i] = shingles.contains(vocabList.get(i)); } return vector; } // ----------------------------------------------------------------------- // Utility // ----------------------------------------------------------------------- private String sanitiseTsv(String s) { if (s == null) return ""; return s.replace("\t", " ").replace("\n", " ").replace("\r", ""); } /** Returns an unmodifiable view of all detected duplicate pairs. */ public List getDuplicatePairs() { return Collections.unmodifiableList(duplicatePairs); } /** Returns the number of corpus sentences indexed. */ public int getCorpusSize() { return corpusIndex.size(); } // ----------------------------------------------------------------------- // Inner data classes // ----------------------------------------------------------------------- /** * Uniquely identifies a sentence by its source file name and line number. */ public static class SentenceKey { public final String fileName; public final int lineNumber; public SentenceKey(String fileName, int lineNumber) { this.fileName = fileName; this.lineNumber = lineNumber; } @Override public boolean equals(Object o) { if (!(o instanceof SentenceKey)) return false; SentenceKey other = (SentenceKey) o; return lineNumber == other.lineNumber && fileName.equals(other.fileName); } @Override public int hashCode() { return 31 * fileName.hashCode() + lineNumber; } @Override public String toString() { return fileName + ":" + lineNumber; } } /** * Holds the raw text and MinHash signature for an indexed corpus sentence. */ private static class IndexedSentence { final String text; final int[] signature; IndexedSentence(String text, int[] signature) { this.text = text; this.signature = signature; } } /** * Represents a detected near-duplicate pair between a new sentence * and a corpus sentence. */ public static class DuplicatePair { public final SentenceKey newKey; public final String newText; public final SentenceKey corpusKey; public final String corpusText; public final double similarity; public DuplicatePair(SentenceKey newKey, String newText, SentenceKey corpusKey, String corpusText, double similarity) { this.newKey = newKey; this.newText = newText; this.corpusKey = corpusKey; this.corpusText = corpusText; this.similarity = similarity; } @Override public String toString() { return String.format("[%.2f] %s ↔ %s", similarity, newKey, corpusKey); } } }