// Origin: IfGPT-DataQualityComponents/java/bg/bas/dcl/LLMs/FileCleanProcessor.java
// Repository page header (not part of the program):
//   uploaded by dcl-ibl-bas — "Upload 22 files"
//   commit 18573e4 (verified)
package bg.bas.dcl.LLMs.IfGPTDataset;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
import bg.bas.dcl.general.FileHandler;
/**
* FileCleanProcessor — corpus boilerplate remover.
*
* Two-phase cleaning:
*
* Phase 1 — LEARN (from a sample directory):
* Scans every .txt file in the sample dir and records how many files each
* non-empty line appears in. Lines that appear in ≥ THRESHOLD of the
* sample files are added to the "common lines" blocklist.
* The blocklist is also saved to disk for inspection / reuse.
*
* Phase 2 — CLEAN (over the full data directory):
* For every .txt file, removes lines that:
* (a) appear in the learned common-lines blocklist, OR
* (b) match any of the hardcoded boilerplate regex patterns
* (HTML/XML tags, PHP markers, navigation patterns,
* URLs, e-mail addresses, cookie/GDPR banners).
* Cleaned files overwrite the originals; a .bak copy of the original is kept when keepBackup is true.
*
* Usage:
* FileCleanProcessor fcp = new FileCleanProcessor(0.50); // 50 % threshold
* fcp.learnFromSample("/path/to/sample/dir/");
* fcp.saveBlocklist("/path/to/blocklist.txt"); // optional
* fcp.cleanDirectory("/path/to/full/data/dir/", true); // true = keep .bak
*/
public class FileCleanProcessor {
// -----------------------------------------------------------------------
// Configuration
// -----------------------------------------------------------------------
/**
 * Fraction of sample files a line must appear in to be considered boilerplate.
 * Assigned once by the constructor; the learn phase compares a line's file
 * count against {@code threshold * numberOfSampleFiles}.
 */
private final double threshold;
/**
 * Minimum length of a line <em>after</em> {@code String.trim()} for it to be
 * evaluated. Interior whitespace counts towards this length. Shorter lines
 * (blank separators, one- or two-character fragments) are skipped while
 * learning and are always kept while cleaning.
 */
private static final int MIN_LINE_LENGTH = 3;
// -----------------------------------------------------------------------
// State
// -----------------------------------------------------------------------
/**
 * Exact-match blocklist of trimmed lines. Filled by the learn phase and
 * replaceable through {@code setCommonLines} / {@code loadBlocklist}.
 */
private final Set<String> commonLines = new HashSet<>();
/**
 * Diagnostic: line → number of sample files it appeared in. The learn phase
 * records every counted line here, not only those that reached the threshold.
 * Insertion-ordered.
 */
private final Map<String, Integer> lineFrequency = new LinkedHashMap<>();
// -----------------------------------------------------------------------
// Hardcoded boilerplate patterns (always applied regardless of frequency)
// -----------------------------------------------------------------------
/**
 * Hardcoded boilerplate patterns. They are applied with {@code find()} to the
 * trimmed line, so an unanchored pattern matches anywhere in the line.
 *
 * Patterns that contain Cyrillic alternatives use {@code (?iu)}: in
 * java.util.regex a plain {@code (?i)} is case-insensitive for US-ASCII only,
 * so without the {@code u} (UNICODE_CASE) flag "Начало" or "НАЧАЛО" would not
 * match the lower-case alternative "начало".
 */
private static final List<Pattern> BOILERPLATE_PATTERNS = List.of(
    // ---- HTML / XML tags ------------------------------------------------
    Pattern.compile("(?i)^\\s*<[^>]+>\\s*$"), // whole-line tag
    Pattern.compile("(?i).*<(script|style|head|meta|link|iframe)[^>]*>.*"),
    Pattern.compile("(?i).*</(script|style|head|body|html)>.*"),
    Pattern.compile("(?i).*<!--.*-->.*"), // HTML comment
    Pattern.compile("(?i).*&(nbsp|amp|lt|gt|quot|apos);.*"), // HTML entities
    // ---- PHP / server-side markers --------------------------------------
    Pattern.compile("(?i).*<\\?php.*"),
    Pattern.compile("(?i).*\\?>\\s*"),
    Pattern.compile("(?i).*<%.*%>.*"), // ASP-style tags
    // ---- Navigation / menu patterns ------------------------------------
    Pattern.compile("(?iu)^\\s*(home|начало|меню|menu|навигация|navigation"
        + "|търсене|search|вход|login|изход|logout"
        + "|регистрация|register|контакти|contacts"
        + "|за нас|about us|sitemap|карта на сайта)\\s*$"),
    Pattern.compile("(?iu)^\\s*(next|prev|previous|следващ|предишен"
        + "|напред|назад|нагоре|back|forward|top|горе)\\s*$"),
    Pattern.compile("(?i)^\\s*\\|\\s*(.*\\|\\s*)+$"), // pipe-separated nav bars
    // Line that starts with two or more '>' markers (e.g. ">> Section").
    // Note: this does not match "A > B > C" style breadcrumbs.
    Pattern.compile("(?i)^\\s*(>\\s*){2,}"),
    // Numbered nav lists: three or more numbers ("1 2 3", "1. 2. 3.").
    // The last number must not require trailing whitespace, because the
    // line is trimmed before matching.
    Pattern.compile("(?i)^\\s*(\\d+\\.?\\s+){2,}\\d+\\.?\\s*$"),
    // ---- URLs ----------------------------------------------------------
    Pattern.compile("(?i)\\bhttps?://\\S+"),
    Pattern.compile("(?i)\\bwww\\.\\S+\\.\\S+"),
    Pattern.compile("(?i)\\bftp://\\S+"),
    // ---- E-mail addresses ----------------------------------------------
    Pattern.compile("[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"),
    // ---- Cookie / GDPR banners -----------------------------------------
    Pattern.compile("(?iu).*(бисквитки|cookies|gdpr|privacy policy|поверителност"
        + "|приемам|accept all|отхвърлям|decline|consent"
        + "|лични данни|personal data|условия за ползване"
        + "|terms of (use|service)|политика за).*"),
    // ---- Social / sharing buttons --------------------------------------
    Pattern.compile("(?iu)^\\s*(share|сподели|like|харесай|tweet|retweet"
        + "|pinterest|linkedin|facebook|twitter|instagram"
        + "|google\\+?|youtube|tiktok|viber|whatsapp)\\s*$"),
    // ---- Counters / analytics snippets ---------------------------------
    Pattern.compile("(?i).*google.analytics.*"), // '.' also covers "google-analytics"
    Pattern.compile("(?i).*ga\\s*\\(\\s*['\"].*"),
    Pattern.compile("(?i).*gtag\\s*\\(.*"),
    Pattern.compile("(?i).*_gaq\\.push.*"),
    // ---- Print / date / page artefacts ---------------------------------
    Pattern.compile("(?iu)^\\s*страница\\s+\\d+\\s*(от\\s+\\d+)?\\s*$"), // "страница 1 от 5"
    Pattern.compile("(?i)^\\s*page\\s+\\d+\\s*(of\\s+\\d+)?\\s*$"),
    Pattern.compile("(?i)^\\s*©.*$"), // copyright line
    Pattern.compile("(?i)^\\s*all rights reserved.*$"),
    Pattern.compile("(?iu)^\\s*права запазени.*$"),
    // ---- Lines that are purely punctuation / symbols -------------------
    Pattern.compile("^[\\s\\p{Punct}\\|\\-_=*~`^]+$")
);
// -----------------------------------------------------------------------
// Constructor
// -----------------------------------------------------------------------
/**
 * Creates a processor with the given boilerplate-frequency threshold.
 *
 * @param threshold fraction [0,1] of sample files a line must appear in
 *                  to be added to the blocklist (e.g. 0.50 for 50 %).
 * @throws IllegalArgumentException if {@code threshold} is NaN or lies
 *                                  outside [0, 1]
 */
public FileCleanProcessor(double threshold) {
    // Written as a negated range test on purpose: every comparison with NaN
    // is false, so "threshold < 0 || threshold > 1" would let NaN through.
    if (!(threshold >= 0 && threshold <= 1)) {
        throw new IllegalArgumentException("Threshold must be in [0, 1].");
    }
    this.threshold = threshold;
}
// -----------------------------------------------------------------------
// Phase 1 — Learn from sample
// -----------------------------------------------------------------------
/**
 * Phase 1: scans all .txt files in {@code sampleDir}, counts how many files
 * each trimmed line of at least {@code MIN_LINE_LENGTH} characters appears
 * in, and repopulates {@link #commonLines} with the lines whose file count
 * is at least {@code threshold * numberOfSampleFiles}.
 *
 * The previous blocklist and frequency map are replaced only after the whole
 * sample has been scanned; if no .txt file is found or an exception occurs,
 * they are left as they were. Exceptions are printed, not rethrown.
 *
 * @param sampleDir directory containing representative sample .txt files
 */
public void learnFromSample(String sampleDir) {
    try {
        FileHandler fh = new FileHandler();
        List<File> sampleFiles = new ArrayList<>();
        for (File f : fh.getFileListing(new File(sampleDir))) {
            if (f.isFile() && f.getName().endsWith(".txt")) {
                sampleFiles.add(f);
            }
        }
        int total = sampleFiles.size();
        if (total == 0) {
            System.err.println("[LearnPhase] No .txt files found in: " + sampleDir);
            return;
        }
        System.out.println("[LearnPhase] Scanning " + total + " sample files...");
        // For each file, collect the *distinct* lines it contains so a
        // repeated line inside one document only counts once.
        Map<String, Integer> fileCount = new HashMap<>();
        for (File f : sampleFiles) {
            Set<String> seenInFile = new HashSet<>();
            // try-with-resources: the scanner is closed even if reading throws.
            try (Scanner s = new Scanner(f, "UTF-8")) {
                while (s.hasNextLine()) {
                    String line = s.nextLine().trim();
                    if (line.length() < MIN_LINE_LENGTH) {
                        continue;
                    }
                    if (seenInFile.add(line)) { // first occurrence in this file
                        fileCount.merge(line, 1, Integer::sum);
                    }
                }
                // Scanner hides read errors (hasNextLine() just returns false);
                // report them so a partially read sample file is not silent.
                if (s.ioException() != null) {
                    System.err.println("[LearnPhase] Read error in " + f + ": "
                            + s.ioException());
                }
            }
        }
        // Apply threshold
        commonLines.clear();
        lineFrequency.clear();
        double cutoff = threshold * total;
        for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
            lineFrequency.put(entry.getKey(), entry.getValue());
            if (entry.getValue() >= cutoff) {
                commonLines.add(entry.getKey());
            }
        }
        // Math.round instead of an (int) cast: 0.29 * 100 is 28.999… in
        // floating point and would otherwise be reported as 28%.
        System.out.println("[LearnPhase] Common lines identified: " + commonLines.size()
                + " (threshold=" + Math.round(threshold * 100) + "%, files=" + total + ")");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Replaces the learned common-lines set with a pre-built one.
 * Useful when loading a previously saved blocklist.
 *
 * Entries are stored trimmed, because lookups during cleaning use the
 * trimmed form of each line; {@code null} entries are ignored.
 *
 * @param lines set of exact line strings to treat as boilerplate
 * @throws NullPointerException if {@code lines} is null (the current
 *                              blocklist is left untouched in that case)
 */
public void setCommonLines(Set<String> lines) {
    // Copy before clearing: 'lines' may be the live view returned by
    // getCommonLines(), in which case clearing first would empty the
    // argument as well and the blocklist would be lost.
    Set<String> snapshot = new HashSet<>(lines);
    commonLines.clear();
    for (String entry : snapshot) {
        if (entry != null) {
            commonLines.add(entry.trim());
        }
    }
}
// -----------------------------------------------------------------------
// Blocklist persistence
// -----------------------------------------------------------------------
/**
 * Saves the current blocklist to a UTF-8 plain-text file: a short comment
 * header followed by one {@code <frequency>TAB<line>} row per entry, sorted
 * by descending frequency (ties in lexicographic order).
 *
 * Every blocklist entry is written. Entries that have no recorded frequency
 * are written with frequency 0 rather than being dropped. Failures are
 * printed, not rethrown.
 *
 * @param outPath destination file path
 */
public void saveBlocklist(String outPath) {
    try (PrintWriter pw = new PrintWriter(
            new OutputStreamWriter(new FileOutputStream(outPath), "UTF-8"))) {
        pw.println("# FileCleanProcessor blocklist");
        pw.println("# threshold=" + threshold
                + " entries=" + commonLines.size());
        pw.println("# Format: <frequency TAB line>");
        pw.println();
        // Iterate the blocklist itself (not the frequency map) so that the
        // number of rows always equals the "entries=" value in the header.
        commonLines.stream()
                .sorted((a, b) -> {
                    int byFrequency = Integer.compare(
                            lineFrequency.getOrDefault(b, 0),
                            lineFrequency.getOrDefault(a, 0));
                    return byFrequency != 0 ? byFrequency : a.compareTo(b);
                })
                .forEach(line -> pw.println(
                        lineFrequency.getOrDefault(line, 0) + "\t" + line));
        // PrintWriter never throws on write errors; checkError() flushes
        // and reports whether any occurred.
        if (pw.checkError()) {
            System.err.println("[Blocklist] Write error while saving to: " + outPath);
        } else {
            System.out.println("[Blocklist] Saved " + commonLines.size()
                    + " entries to: " + outPath);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Loads a blocklist previously saved by {@link #saveBlocklist}.
 * Comment lines (starting with #) and blank lines are skipped.
 *
 * Each remaining row is either {@code <frequency>TAB<content>} or a bare
 * {@code <content>}. The part before the first tab is treated as a frequency
 * only if it consists of 1–9 decimal digits, so a bare entry that merely
 * contains a tab is kept whole. Content is stored trimmed, and frequencies
 * found in the file are recorded in the frequency map.
 *
 * The current blocklist is replaced only after the file has been read
 * completely; if the file cannot be opened or read, the existing blocklist
 * is left unchanged and the exception is printed.
 *
 * @param blocklistPath path to the blocklist file
 */
public void loadBlocklist(String blocklistPath) {
    Set<String> loaded = new HashSet<>();
    Map<String, Integer> loadedFrequency = new HashMap<>();
    try (Scanner sc = new Scanner(new File(blocklistPath), "UTF-8")) {
        while (sc.hasNextLine()) {
            String line = sc.nextLine();
            if (line.startsWith("#") || line.isBlank()) {
                continue;
            }
            int tab = line.indexOf('\t');
            // A frequency prefix is 1..9 ASCII digits (always fits in an int).
            boolean hasFrequency = tab > 0 && tab <= 9;
            for (int i = 0; hasFrequency && i < tab; i++) {
                char c = line.charAt(i);
                hasFrequency = c >= '0' && c <= '9';
            }
            String content = (hasFrequency ? line.substring(tab + 1) : line).trim();
            if (content.isEmpty()) {
                continue;
            }
            loaded.add(content);
            if (hasFrequency) {
                loadedFrequency.put(content, Integer.parseInt(line.substring(0, tab)));
            }
        }
        // Scanner hides read errors; surface them before touching state.
        if (sc.ioException() != null) {
            throw sc.ioException();
        }
        commonLines.clear();
        commonLines.addAll(loaded);
        lineFrequency.putAll(loadedFrequency);
        System.out.println("[Blocklist] Loaded " + commonLines.size()
                + " entries from: " + blocklistPath);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
// -----------------------------------------------------------------------
// Phase 2 — Clean full directory
// -----------------------------------------------------------------------
/**
 * Phase 2: runs {@link #cleanFile} on every regular file whose name ends in
 * ".txt" among the files listed for {@code dataDir}, and prints per-file and
 * overall statistics to stdout. If the blocklist is empty a warning is
 * printed and only the regex patterns take effect. Exceptions are printed,
 * not rethrown.
 *
 * @param dataDir    directory containing corpus .txt files to clean
 * @param keepBackup passed through to {@link #cleanFile}; if true, a .bak
 *                   copy of each modified file is kept
 */
public void cleanDirectory(String dataDir, boolean keepBackup) {
    try {
        if (commonLines.isEmpty()) {
            System.out.println("[CleanPhase] Warning: no common lines loaded. "
                    + "Only regex patterns will be applied.");
        }
        FileHandler lister = new FileHandler();
        int fileTally = 0;
        int removedTally = 0;
        for (File candidate : lister.getFileListing(new File(dataDir))) {
            boolean isTextFile = candidate.isFile()
                    && candidate.getName().endsWith(".txt");
            if (isTextFile) {
                CleanResult outcome = cleanFile(candidate, keepBackup);
                fileTally += 1;
                removedTally += outcome.linesRemoved;
                if (outcome.linesRemoved > 0) {
                    String report = "[CleanPhase] " + candidate.getName()
                            + " — removed " + outcome.linesRemoved + " lines.";
                    System.out.println(report);
                }
            }
        }
        String summary = "[CleanPhase] Done. Files processed: " + fileTally
                + " Total lines removed: " + removedTally;
        System.out.println(summary);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Cleans a single file in place: reads it as UTF-8, drops every line for
 * which {@link #shouldRemove} returns true, and — only if at least one line
 * was dropped — rewrites the file with the remaining lines, each terminated
 * by "\n".
 *
 * If reading, backing up or writing fails, the exception is printed and the
 * result reports 0 removed lines. The file is overwritten directly, so a
 * failure during the write itself can leave it incomplete; use
 * {@code keepBackup} when that matters.
 *
 * @param file       the .txt file to clean
 * @param keepBackup if true, a copy of the original is written to
 *                   {@code <file>.bak} (replacing any existing one) before
 *                   the file is overwritten
 * @return CleanResult with statistics
 */
public CleanResult cleanFile(File file, boolean keepBackup) {
    int removed = 0;
    try {
        // Read and filter in one pass.
        List<String> outputLines = new ArrayList<>();
        try (Scanner sc = new Scanner(file, "UTF-8")) {
            while (sc.hasNextLine()) {
                String line = sc.nextLine();
                if (shouldRemove(line)) {
                    removed++;
                } else {
                    outputLines.add(line);
                }
            }
            // Scanner swallows read errors and simply stops returning lines.
            // Without this check a failed read would look like a short file
            // and the original would be overwritten with truncated content.
            if (sc.ioException() != null) {
                throw sc.ioException();
            }
        }
        if (removed > 0) {
            // Backup
            if (keepBackup) {
                File bak = new File(file.getAbsolutePath() + ".bak");
                Files.copy(file.toPath(), bak.toPath(),
                        StandardCopyOption.REPLACE_EXISTING);
            }
            // Overwrite; try-with-resources closes the stream on failure too.
            try (Writer w = new OutputStreamWriter(
                    new FileOutputStream(file), "UTF-8")) {
                for (String l : outputLines) {
                    w.write(l + "\n");
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Do not claim removals when the cleaning did not complete.
        return new CleanResult(file, 0);
    }
    return new CleanResult(file, removed);
}
// -----------------------------------------------------------------------
// Core line decision
// -----------------------------------------------------------------------
/**
 * Returns true if the line should be removed.
 *
 * A line is removed if:
 * 1. Its trimmed form is in the learned common-lines blocklist, OR
 * 2. Any hardcoded boilerplate regex pattern is found in its trimmed form.
 *
 * Lines whose trimmed length is below MIN_LINE_LENGTH (including blank
 * lines) are always kept so that paragraph structure is preserved.
 * A {@code null} line is never removed.
 *
 * @param rawLine the original line from the file (not yet trimmed)
 * @return true if the line is boilerplate and should be dropped
 */
public boolean shouldRemove(String rawLine) {
    if (rawLine == null) {
        return false;
    }
    String trimmed = rawLine.trim();
    // Always keep blank/very-short lines (paragraph separators)
    if (trimmed.length() < MIN_LINE_LENGTH) {
        return false;
    }
    // 1. Exact-match blocklist
    if (commonLines.contains(trimmed)) {
        return true;
    }
    // 2. Regex boilerplate patterns. find() alone is sufficient: whenever
    //    the whole input matches, find() succeeds as well, so a separate
    //    matches() call only built a second matcher for nothing.
    for (Pattern p : BOILERPLATE_PATTERNS) {
        if (p.matcher(trimmed).find()) {
            return true;
        }
    }
    return false;
}
// -----------------------------------------------------------------------
// Diagnostic helpers
// -----------------------------------------------------------------------
/**
 * Exposes the current blocklist read-only.
 *
 * @return an unmodifiable view backed by the internal common-lines set
 */
public Set<String> getCommonLines() {
    final Set<String> readOnlyView = java.util.Collections.unmodifiableSet(commonLines);
    return readOnlyView;
}
/**
 * Exposes the frequency map (line → number of sample files) read-only.
 *
 * @return an unmodifiable view backed by the internal map (not a copy)
 */
public Map<String, Integer> getLineFrequency() {
    final Map<String, Integer> readOnlyView = java.util.Collections.unmodifiableMap(lineFrequency);
    return readOnlyView;
}
/**
 * Prints a summary of the top {@code n} most-frequent common lines to stdout,
 * highest frequency first (ties in lexicographic order). Blocklist entries
 * without a recorded frequency are listed with frequency 0.
 *
 * @param n maximum number of lines to print; values below 1 print only the
 *          header
 */
public void printTopCommonLines(int n) {
    System.out.println("--- Top " + n + " common lines (by sample frequency) ---");
    commonLines.stream()
            .sorted((a, b) -> {
                // Integer.compare instead of subtraction: no overflow risk.
                int byFrequency = Integer.compare(
                        lineFrequency.getOrDefault(b, 0),
                        lineFrequency.getOrDefault(a, 0));
                return byFrequency != 0 ? byFrequency : a.compareTo(b);
            })
            // Stream.limit rejects negative sizes with IllegalArgumentException.
            .limit(Math.max(0, n))
            .forEach(line -> System.out.printf(" [%4d] %s%n",
                    lineFrequency.getOrDefault(line, 0), line));
}
// -----------------------------------------------------------------------
// Inner result class
// -----------------------------------------------------------------------
/** Immutable outcome of one {@link #cleanFile} call. */
public static class CleanResult {
    /** The file that was processed. */
    public final File file;
    /** Number of lines reported as removed from {@link #file}. */
    public final int linesRemoved;

    /**
     * @param file         the processed file
     * @param linesRemoved number of lines removed from it
     */
    public CleanResult(File file, int linesRemoved) {
        this.linesRemoved = linesRemoved;
        this.file = file;
    }
}
}