package bg.bas.dcl.LLMs.IfGPTDataset;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
import bg.bas.dcl.general.FileHandler;
/**
 * FileCleanProcessor — corpus boilerplate remover.
 *
 * <p>Two-phase cleaning:
 *
 * <p><b>Phase 1 — LEARN</b> (from a sample directory): scans every .txt file
 * returned for the sample directory and records how many files each trimmed
 * line of at least {@code MIN_LINE_LENGTH} characters appears in. Lines that
 * appear in at least {@code threshold * fileCount} sample files form the
 * "common lines" blocklist, which can be saved to disk for inspection or reuse.
 *
 * <p><b>Phase 2 — CLEAN</b> (over the full data directory): for every .txt
 * file, removes lines that
 * <ol>
 *   <li>appear (after trimming) in the common-lines blocklist, or</li>
 *   <li>match any of the hardcoded boilerplate regex patterns (HTML/XML tags,
 *       PHP markers, navigation patterns, URLs, e-mail addresses, cookie/GDPR
 *       banners, ...).</li>
 * </ol>
 * Cleaned files overwrite the originals; a {@code .bak} copy is written first
 * when {@code keepBackup} is {@code true}.
 *
 * <p>Usage:
 * <pre>
 *   FileCleanProcessor fcp = new FileCleanProcessor(0.50);  // 50 % threshold
 *   fcp.learnFromSample("/path/to/sample/dir/");
 *   fcp.saveBlocklist("/path/to/blocklist.txt");            // optional
 *   fcp.cleanDirectory("/path/to/full/data/dir/", true);    // true = keep .bak
 * </pre>
 *
 * <p>All I/O methods are best-effort: failures are reported on stderr and do
 * not propagate to the caller.
 */
public class FileCleanProcessor {

    // -----------------------------------------------------------------------
    // Configuration
    // -----------------------------------------------------------------------

    /** Fraction of sample files a line must appear in to be considered boilerplate. */
    private final double threshold;

    /**
     * Minimum length of a trimmed line for it to be evaluated at all. Shorter
     * lines (blank separators, stray one- or two-character lines) are never
     * counted in the learn phase and never removed in the clean phase.
     */
    private static final int MIN_LINE_LENGTH = 3;

    // -----------------------------------------------------------------------
    // State
    // -----------------------------------------------------------------------

    /** Lines found to be common across the sample (Phase 1 output). */
    private final Set<String> commonLines = new HashSet<>();

    /** Diagnostic: line → number of sample files it appeared in. */
    private final Map<String, Integer> lineFrequency = new LinkedHashMap<>();

    /** Orders entries by descending frequency, then by line text so output is deterministic. */
    private static final Comparator<Map.Entry<String, Integer>> BY_FREQUENCY_DESC = (a, b) -> {
        int byCount = Integer.compare(b.getValue(), a.getValue());
        return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
    };

    // -----------------------------------------------------------------------
    // Hardcoded boilerplate patterns (always applied regardless of frequency)
    // -----------------------------------------------------------------------

    /**
     * Patterns are applied with {@code Matcher.find()} to the trimmed line, so a
     * pattern without {@code ^...$} anchors removes any line that merely contains
     * a match. Case-insensitive patterns are built via {@link #ci(String)} so that
     * Cyrillic keywords also match regardless of letter case.
     */
    private static final List<Pattern> BOILERPLATE_PATTERNS = Collections.unmodifiableList(Arrays.asList(
            // ---- HTML / XML tags ------------------------------------------------
            ci("^\\s*<[^>]+>\\s*$"),                                   // whole-line tag
            ci("<(script|style|head|meta|link|iframe)[^>]*>"),
            ci("</(script|style|head|body|html)>"),
            ci("<!--.*-->"),                                           // HTML comment
            ci("&(nbsp|amp|lt|gt|quot|apos);"),                        // HTML entities
            // ---- PHP / server-side markers --------------------------------------
            ci("<\\?php"),
            ci("\\?>"),
            ci("<%.*%>"),                                              // ASP-style tags
            // ---- Navigation / menu patterns ------------------------------------
            ci("^\\s*(home|начало|меню|menu|навигация|navigation"
                    + "|търсене|search|вход|login|изход|logout"
                    + "|регистрация|register|контакти|contacts"
                    + "|за нас|about us|sitemap|карта на сайта)\\s*$"),
            ci("^\\s*(next|prev|previous|следващ|предишен"
                    + "|напред|назад|нагоре|back|forward|top|горе)\\s*$"),
            // Pipe-delimited nav bar: line starts and ends with '|'. Written without a
            // nested quantifier so it cannot backtrack exponentially on long lines.
            ci("^\\s*\\|.*\\|\\s*$"),
            // Line that begins with two or more '>' markers.
            // NOTE(review): this does not match "A > B > C" style breadcrumbs — confirm intent.
            ci("^\\s*(>\\s*){2,}"),
            // Three or more whitespace-separated numbers, e.g. pager "1 2 3 4".
            // The last number needs no trailing whitespace because lines are trimmed.
            ci("^\\s*\\d+\\.?(\\s+\\d+\\.?){2,}\\s*$"),
            // ---- URLs ----------------------------------------------------------
            ci("\\bhttps?://\\S+"),
            ci("\\bwww\\.\\S+\\.\\S+"),
            ci("\\bftp://\\S+"),
            // ---- E-mail addresses ----------------------------------------------
            Pattern.compile("[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"),
            // ---- Cookie / GDPR banners -----------------------------------------
            ci("(бисквитки|cookies|gdpr|privacy policy|поверителност"
                    + "|приемам|accept all|отхвърлям|decline|consent"
                    + "|лични данни|personal data|условия за ползване"
                    + "|terms of (use|service)|политика за)"),
            // ---- Social / sharing buttons --------------------------------------
            ci("^\\s*(share|сподели|like|харесай|tweet|retweet"
                    + "|pinterest|linkedin|facebook|twitter|instagram"
                    + "|google\\+?|youtube|tiktok|viber|whatsapp)\\s*$"),
            // ---- Counters / analytics snippets ---------------------------------
            ci("google.analytics"),        // '.' is a wildcard: also matches "google-analytics"
            ci("ga\\s*\\(\\s*['\"]"),
            ci("gtag\\s*\\("),
            ci("_gaq\\.push"),
            // ---- Print / date / page artefacts ---------------------------------
            ci("^\\s*страница\\s+\\d+\\s*(от\\s+\\d+)?\\s*$"),         // "страница 1 от 5"
            ci("^\\s*page\\s+\\d+\\s*(of\\s+\\d+)?\\s*$"),
            ci("^\\s*©.*$"),                                           // copyright line
            ci("^\\s*all rights reserved.*$"),
            ci("^\\s*права запазени.*$"),
            // ---- Lines that are purely punctuation / symbols -------------------
            Pattern.compile("^[\\s\\p{Punct}\\|\\-_=*~`^]+$")
    ));

    /**
     * Compiles a case-insensitive pattern with Unicode-aware case folding.
     * {@code CASE_INSENSITIVE} alone only folds US-ASCII letters, which would
     * leave the Bulgarian keywords case-sensitive.
     */
    private static Pattern ci(String regex) {
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }

    // -----------------------------------------------------------------------
    // Constructor
    // -----------------------------------------------------------------------

    /**
     * @param threshold fraction [0,1] of sample files a line must appear in
     *                  to be added to the blocklist (e.g. 0.50 for 50 %).
     * @throws IllegalArgumentException if {@code threshold} is outside [0,1] or NaN
     */
    public FileCleanProcessor(double threshold) {
        // Written as a negated range check so that NaN is rejected as well.
        if (!(threshold >= 0 && threshold <= 1)) {
            throw new IllegalArgumentException("Threshold must be in [0, 1].");
        }
        this.threshold = threshold;
    }

    // -----------------------------------------------------------------------
    // Phase 1 — Learn from sample
    // -----------------------------------------------------------------------

    /**
     * Scans all .txt files listed for {@code sampleDir}, counts how many files
     * each trimmed line (of at least {@code MIN_LINE_LENGTH} characters) appears
     * in, and replaces {@link #commonLines} with those meeting the threshold.
     * The frequency map is replaced with the counts of every line seen.
     *
     * <p>If no .txt file is found, or any sample file cannot be read, the
     * previously learned state is left untouched and the problem is reported
     * on stderr.
     *
     * @param sampleDir directory containing representative sample .txt files
     */
    public void learnFromSample(String sampleDir) {
        try {
            FileHandler fh = new FileHandler();
            List<File> sampleFiles = new ArrayList<>();
            for (File candidate : fh.getFileListing(new File(sampleDir))) {
                if (isTextFile(candidate)) {
                    sampleFiles.add(candidate);
                }
            }
            int total = sampleFiles.size();
            if (total == 0) {
                System.err.println("[LearnPhase] No .txt files found in: " + sampleDir);
                return;
            }
            System.out.println("[LearnPhase] Scanning " + total + " sample files...");

            // Count per-file presence: a line repeated inside one document counts once.
            Map<String, Integer> fileCount = new HashMap<>();
            for (File sample : sampleFiles) {
                Set<String> seenInFile = new HashSet<>();
                for (String raw : readLines(sample)) {
                    String line = raw.trim();
                    if (line.length() >= MIN_LINE_LENGTH && seenInFile.add(line)) {
                        fileCount.merge(line, 1, Integer::sum);
                    }
                }
            }

            // Apply threshold
            commonLines.clear();
            lineFrequency.clear();
            double cutoff = threshold * total;
            for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
                lineFrequency.put(entry.getKey(), entry.getValue());
                if (entry.getValue() >= cutoff) {
                    commonLines.add(entry.getKey());
                }
            }
            System.out.println("[LearnPhase] Common lines identified: " + commonLines.size()
                    + " (threshold=" + Math.round(threshold * 100) + "%, files=" + total + ")");
        } catch (Exception e) {
            System.err.println("[LearnPhase] Failed to learn from: " + sampleDir);
            e.printStackTrace();
        }
    }

    /**
     * Replaces the learned common-lines set with a pre-built one.
     * Useful when loading a previously saved blocklist.
     *
     * <p>Entries are trimmed because lines are compared in trimmed form;
     * {@code null} entries are ignored. The argument is copied before the
     * internal set is cleared, so passing the view returned by
     * {@link #getCommonLines()} is safe.
     *
     * @param lines set of line strings to treat as boilerplate
     * @throws NullPointerException if {@code lines} is null
     */
    public void setCommonLines(Set<String> lines) {
        Objects.requireNonNull(lines, "lines");
        Set<String> normalized = new HashSet<>();
        for (String line : lines) {
            if (line != null) {
                normalized.add(line.trim());
            }
        }
        commonLines.clear();
        commonLines.addAll(normalized);
    }

    // -----------------------------------------------------------------------
    // Blocklist persistence
    // -----------------------------------------------------------------------

    /**
     * Saves the current blocklist to a UTF-8 text file: a short {@code #} header
     * followed by one {@code <frequency>TAB<line>} record per entry, sorted by
     * descending frequency. Entries with no recorded frequency (for example
     * those supplied through {@link #setCommonLines}) are written with
     * frequency 0 so that the file always contains every entry.
     *
     * @param outPath destination file path
     */
    public void saveBlocklist(String outPath) {
        try (PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(outPath), StandardCharsets.UTF_8))) {
            pw.println("# FileCleanProcessor blocklist");
            pw.println("# threshold=" + threshold
                    + " entries=" + commonLines.size());
            pw.println("# Format: <frequency TAB line>");
            pw.println();
            for (Map.Entry<String, Integer> entry : rankedCommonLines()) {
                pw.println(entry.getValue() + "\t" + entry.getKey());
            }
            // PrintWriter never throws on write failures; ask it explicitly.
            if (pw.checkError()) {
                throw new IOException("Write error while saving blocklist to " + outPath);
            }
            System.out.println("[Blocklist] Saved " + commonLines.size()
                    + " entries to: " + outPath);
        } catch (Exception e) {
            System.err.println("[Blocklist] Failed to save: " + outPath);
            e.printStackTrace();
        }
    }

    /**
     * Loads a blocklist previously saved by {@link #saveBlocklist}.
     * Comment lines (starting with #) and blank lines are skipped. Each
     * remaining line is either {@code <frequency>TAB<content>} or bare content;
     * the text before the first tab is treated as a frequency only when it is
     * a non-negative integer, otherwise the whole line is the content.
     *
     * <p>The current blocklist is replaced only after the file was read
     * completely; on failure it is left unchanged. Frequencies found in the
     * file are recorded in the frequency map.
     *
     * @param blocklistPath path to the blocklist file
     */
    public void loadBlocklist(String blocklistPath) {
        try {
            Set<String> loaded = new HashSet<>();
            Map<String, Integer> loadedFrequency = new LinkedHashMap<>();
            for (String line : readLines(new File(blocklistPath))) {
                if (line.startsWith("#") || line.isBlank()) {
                    continue;
                }
                int tab = line.indexOf('\t');
                Integer frequency = (tab > 0) ? parseFrequency(line.substring(0, tab)) : null;
                String content = (frequency != null) ? line.substring(tab + 1) : line;
                if (content.isBlank()) {
                    continue;
                }
                String entry = content.trim();
                loaded.add(entry);
                if (frequency != null) {
                    loadedFrequency.put(entry, frequency);
                }
            }
            commonLines.clear();
            commonLines.addAll(loaded);
            lineFrequency.putAll(loadedFrequency);
            System.out.println("[Blocklist] Loaded " + commonLines.size()
                    + " entries from: " + blocklistPath);
        } catch (Exception e) {
            System.err.println("[Blocklist] Failed to load: " + blocklistPath);
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------
    // Phase 2 — Clean full directory
    // -----------------------------------------------------------------------

    /**
     * Cleans every .txt file listed for {@code dataDir} by removing lines that
     * are in the blocklist or match a hardcoded boilerplate pattern, and prints
     * per-file and total statistics.
     *
     * @param dataDir    directory containing corpus .txt files to clean
     * @param keepBackup if true, each modified file is first copied to
     *                   {@code <name>.bak} (replacing an existing backup)
     */
    public void cleanDirectory(String dataDir, boolean keepBackup) {
        try {
            if (commonLines.isEmpty()) {
                System.out.println("[CleanPhase] Warning: no common lines loaded. "
                        + "Only regex patterns will be applied.");
            }
            FileHandler fh = new FileHandler();
            int processed = 0;
            int linesRemoved = 0;
            for (File candidate : fh.getFileListing(new File(dataDir))) {
                if (!isTextFile(candidate)) {
                    continue;
                }
                CleanResult result = cleanFile(candidate, keepBackup);
                processed++;
                linesRemoved += result.linesRemoved;
                if (result.linesRemoved > 0) {
                    System.out.println("[CleanPhase] " + candidate.getName()
                            + " — removed " + result.linesRemoved + " lines.");
                }
            }
            System.out.println("[CleanPhase] Done. Files processed: " + processed
                    + " Total lines removed: " + linesRemoved);
        } catch (Exception e) {
            System.err.println("[CleanPhase] Failed to clean directory: " + dataDir);
            e.printStackTrace();
        }
    }

    /**
     * Cleans a single file in place. The file is rewritten (UTF-8, one
     * {@code \n} after every kept line) only when at least one line is removed.
     *
     * @param file       the .txt file to clean
     * @param keepBackup if true, the original is copied to {@code <path>.bak}
     *                   (replacing an existing backup) before it is overwritten
     * @return the number of lines removed; 0 if nothing matched or if the file
     *         could not be read or rewritten (the failure is reported on stderr)
     */
    public CleanResult cleanFile(File file, boolean keepBackup) {
        try {
            // A read error must abort here: writing back a partially read file
            // would silently truncate the original.
            List<String> inputLines = readLines(file);

            List<String> keptLines = new ArrayList<>(inputLines.size());
            for (String line : inputLines) {
                if (!shouldRemove(line)) {
                    keptLines.add(line);
                }
            }
            int removed = inputLines.size() - keptLines.size();
            if (removed == 0) {
                return new CleanResult(file, 0);
            }

            if (keepBackup) {
                File bak = new File(file.getAbsolutePath() + ".bak");
                Files.copy(file.toPath(), bak.toPath(), StandardCopyOption.REPLACE_EXISTING);
            }

            try (Writer out = new OutputStreamWriter(
                    new FileOutputStream(file), StandardCharsets.UTF_8)) {
                for (String line : keptLines) {
                    out.write(line);
                    out.write('\n');
                }
            }
            return new CleanResult(file, removed);
        } catch (Exception e) {
            System.err.println("[CleanPhase] Failed to clean file: " + file);
            e.printStackTrace();
            return new CleanResult(file, 0);
        }
    }

    // -----------------------------------------------------------------------
    // Core line decision
    // -----------------------------------------------------------------------

    /**
     * Returns true if the line should be removed.
     *
     * <p>A line is removed if:
     * <ol>
     *   <li>its trimmed form is in the common-lines blocklist, or</li>
     *   <li>any hardcoded boilerplate pattern is found in its trimmed form.</li>
     * </ol>
     *
     * <p>Lines whose trimmed length is below {@code MIN_LINE_LENGTH} (including
     * blank lines) are always kept so that paragraph structure is preserved.
     *
     * @param rawLine the original line from the file (not yet trimmed);
     *                {@code null} is treated as "keep"
     * @return true if the line is boilerplate and should be dropped
     */
    public boolean shouldRemove(String rawLine) {
        if (rawLine == null) {
            return false;
        }
        String trimmed = rawLine.trim();
        // Always keep blank/very-short lines (paragraph separators)
        if (trimmed.length() < MIN_LINE_LENGTH) {
            return false;
        }
        // 1. Exact-match blocklist
        if (commonLines.contains(trimmed)) {
            return true;
        }
        // 2. Regex boilerplate patterns; find() already covers full-line matches.
        for (Pattern pattern : BOILERPLATE_PATTERNS) {
            if (pattern.matcher(trimmed).find()) {
                return true;
            }
        }
        return false;
    }

    // -----------------------------------------------------------------------
    // Diagnostic helpers
    // -----------------------------------------------------------------------

    /** Returns an unmodifiable view of the common-lines set. */
    public Set<String> getCommonLines() {
        return Collections.unmodifiableSet(commonLines);
    }

    /** Returns an unmodifiable view of the frequency map (line → number of sample files). */
    public Map<String, Integer> getLineFrequency() {
        return Collections.unmodifiableMap(lineFrequency);
    }

    /**
     * Prints a summary of the top {@code n} most-frequent common lines to stdout.
     * Entries with no recorded frequency are listed with frequency 0.
     *
     * @param n maximum number of lines to print; values below 1 print only the header
     */
    public void printTopCommonLines(int n) {
        System.out.println("--- Top " + n + " common lines (by sample frequency) ---");
        List<Map.Entry<String, Integer>> ranked = rankedCommonLines();
        int shown = Math.min(Math.max(n, 0), ranked.size());
        for (Map.Entry<String, Integer> entry : ranked.subList(0, shown)) {
            System.out.printf(" [%4d] %s%n", entry.getValue(), entry.getKey());
        }
    }

    // -----------------------------------------------------------------------
    // Private helpers
    // -----------------------------------------------------------------------

    /** Returns true for regular files whose name ends with ".txt". */
    private static boolean isTextFile(File f) {
        return f.isFile() && f.getName().endsWith(".txt");
    }

    /**
     * Reads a UTF-8 file into a list of lines. Scanner hides read errors and
     * simply stops returning lines, so the pending error is rethrown here to
     * keep callers from treating a partial read as the whole file.
     */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (Scanner scanner = new Scanner(file, StandardCharsets.UTF_8.name())) {
            while (scanner.hasNextLine()) {
                lines.add(scanner.nextLine());
            }
            IOException readFailure = scanner.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        }
        return lines;
    }

    /** Parses a blocklist frequency field; returns null unless it is 1–9 ASCII digits. */
    private static Integer parseFrequency(String field) {
        String digits = field.trim();
        if (digits.isEmpty() || digits.length() > 9) {
            return null;
        }
        for (int i = 0; i < digits.length(); i++) {
            char c = digits.charAt(i);
            if (c < '0' || c > '9') {
                return null;
            }
        }
        return Integer.valueOf(digits);
    }

    /** Pairs every common line with its recorded frequency (0 if unknown), most frequent first. */
    private List<Map.Entry<String, Integer>> rankedCommonLines() {
        List<Map.Entry<String, Integer>> ranked = new ArrayList<>(commonLines.size());
        for (String line : commonLines) {
            ranked.add(Map.entry(line, lineFrequency.getOrDefault(line, 0)));
        }
        ranked.sort(BY_FREQUENCY_DESC);
        return ranked;
    }

    // -----------------------------------------------------------------------
    // Inner result class
    // -----------------------------------------------------------------------

    /** Simple value object returned by {@link #cleanFile}. */
    public static class CleanResult {
        /** The file that was processed. */
        public final File file;
        /** Number of lines removed from the file. */
        public final int linesRemoved;

        public CleanResult(File file, int linesRemoved) {
            this.file = file;
            this.linesRemoved = linesRemoved;
        }
    }
}
|