| package bg.bas.dcl.LLMs.IfGPTDataset; |
|
|
| import java.io.File; |
| import java.io.FileOutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.PrintWriter; |
| import java.io.Writer; |
| import java.nio.file.Files; |
| import java.nio.file.StandardCopyOption; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Scanner; |
| import java.util.Set; |
| import java.util.regex.Pattern; |
|
|
| import bg.bas.dcl.general.FileHandler; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public class FileCleanProcessor { |
|
|
| |
| |
| |
|
|
| |
| private final double threshold; |
|
|
| |
| |
| private static final int MIN_LINE_LENGTH = 3; |
|
|
| |
| |
| |
|
|
| |
| private final Set<String> commonLines = new HashSet<>(); |
|
|
| |
| private final Map<String, Integer> lineFrequency = new LinkedHashMap<>(); |
|
|
| |
| |
| |
|
|
| private static final List<Pattern> BOILERPLATE_PATTERNS = Arrays.asList( |
|
|
| |
| Pattern.compile("(?i)^\\s*<[^>]+>\\s*$"), |
| Pattern.compile("(?i).*<(script|style|head|meta|link|iframe)[^>]*>.*"), |
| Pattern.compile("(?i).*</(script|style|head|body|html)>.*"), |
| Pattern.compile("(?i).*<!--.*-->.*"), |
| Pattern.compile("(?i).*&(nbsp|amp|lt|gt|quot|apos);.*"), |
|
|
| |
| Pattern.compile("(?i).*<\\?php.*"), |
| Pattern.compile("(?i).*\\?>\\s*"), |
| Pattern.compile("(?i).*<%.*%>.*"), |
|
|
| |
| Pattern.compile("(?i)^\\s*(home|начало|меню|menu|навигация|navigation" |
| + "|търсене|search|вход|login|изход|logout" |
| + "|регистрация|register|контакти|contacts" |
| + "|за нас|about us|sitemap|карта на сайта)\\s*$"), |
| Pattern.compile("(?i)^\\s*(next|prev|previous|следващ|предишен" |
| + "|напред|назад|нагоре|back|forward|top|горе)\\s*$"), |
| Pattern.compile("(?i)^\\s*\\|\\s*(.*\\|\\s*)+$"), |
| Pattern.compile("(?i)^\\s*(>\\s*){2,}"), |
| Pattern.compile("(?i)^\\s*(\\d+\\.?\\s+){3,}$"), |
|
|
| |
| Pattern.compile("(?i)\\bhttps?://\\S+"), |
| Pattern.compile("(?i)\\bwww\\.\\S+\\.\\S+"), |
| Pattern.compile("(?i)\\bftp://\\S+"), |
|
|
| |
| Pattern.compile("[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"), |
|
|
| |
| Pattern.compile("(?i).*(бисквитки|cookies|gdpr|privacy policy|поверителност" |
| + "|приемам|accept all|отхвърлям|decline|consent" |
| + "|лични данни|personal data|условия за ползване" |
| + "|terms of (use|service)|политика за).*"), |
|
|
| |
| Pattern.compile("(?i)^\\s*(share|сподели|like|харесай|tweet|retweet" |
| + "|pinterest|linkedin|facebook|twitter|instagram" |
| + "|google\\+?|youtube|tiktok|viber|whatsapp)\\s*$"), |
|
|
| |
| Pattern.compile("(?i).*google.analytics.*"), |
| Pattern.compile("(?i).*ga\\s*\\(\\s*['\"].*"), |
| Pattern.compile("(?i).*gtag\\s*\\(.*"), |
| Pattern.compile("(?i).*_gaq\\.push.*"), |
|
|
| |
| Pattern.compile("(?i)^\\s*страница\\s+\\d+\\s*(от\\s+\\d+)?\\s*$"), |
| Pattern.compile("(?i)^\\s*page\\s+\\d+\\s*(of\\s+\\d+)?\\s*$"), |
| Pattern.compile("(?i)^\\s*©.*$"), |
| Pattern.compile("(?i)^\\s*all rights reserved.*$"), |
| Pattern.compile("(?i)^\\s*права запазени.*$"), |
|
|
| |
| Pattern.compile("^[\\s\\p{Punct}\\|\\-_=*~`^]+$") |
| ); |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| public FileCleanProcessor(double threshold) { |
| if (threshold < 0 || threshold > 1) |
| throw new IllegalArgumentException("Threshold must be in [0, 1]."); |
| this.threshold = threshold; |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| public void learnFromSample(String sampleDir) { |
| try { |
| FileHandler fh = new FileHandler(); |
| List<File> sampleFiles = new ArrayList<>(); |
|
|
| for (File f : fh.getFileListing(new File(sampleDir))) { |
| if (f.isFile() && f.getName().endsWith(".txt")) |
| sampleFiles.add(f); |
| } |
|
|
| int total = sampleFiles.size(); |
| if (total == 0) { |
| System.err.println("[LearnPhase] No .txt files found in: " + sampleDir); |
| return; |
| } |
| System.out.println("[LearnPhase] Scanning " + total + " sample files..."); |
|
|
| |
| |
| Map<String, Integer> fileCount = new HashMap<>(); |
|
|
| for (File f : sampleFiles) { |
| Set<String> seenInFile = new HashSet<>(); |
| Scanner s = new Scanner(f, "UTF-8"); |
| while (s.hasNextLine()) { |
| String line = s.nextLine().trim(); |
| if (line.length() < MIN_LINE_LENGTH) continue; |
| if (seenInFile.add(line)) { |
| fileCount.merge(line, 1, Integer::sum); |
| } |
| } |
| s.close(); |
| } |
|
|
| |
| commonLines.clear(); |
| lineFrequency.clear(); |
|
|
| double cutoff = threshold * total; |
| for (Map.Entry<String, Integer> entry : fileCount.entrySet()) { |
| lineFrequency.put(entry.getKey(), entry.getValue()); |
| if (entry.getValue() >= cutoff) { |
| commonLines.add(entry.getKey()); |
| } |
| } |
|
|
| System.out.println("[LearnPhase] Common lines identified: " + commonLines.size() |
| + " (threshold=" + (int)(threshold * 100) + "%, files=" + total + ")"); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| public void setCommonLines(Set<String> lines) { |
| commonLines.clear(); |
| commonLines.addAll(lines); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| public void saveBlocklist(String outPath) { |
| try (PrintWriter pw = new PrintWriter( |
| new OutputStreamWriter(new FileOutputStream(outPath), "UTF-8"))) { |
|
|
| pw.println("# FileCleanProcessor blocklist"); |
| pw.println("# threshold=" + threshold |
| + " entries=" + commonLines.size()); |
| pw.println("# Format: <frequency TAB line>"); |
| pw.println(); |
|
|
| |
| lineFrequency.entrySet().stream() |
| .filter(e -> commonLines.contains(e.getKey())) |
| .sorted((a, b) -> b.getValue() - a.getValue()) |
| .forEach(e -> pw.println(e.getValue() + "\t" + e.getKey())); |
|
|
| System.out.println("[Blocklist] Saved " + commonLines.size() |
| + " entries to: " + outPath); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| public void loadBlocklist(String blocklistPath) { |
| try { |
| commonLines.clear(); |
| Scanner sc = new Scanner(new File(blocklistPath), "UTF-8"); |
| while (sc.hasNextLine()) { |
| String line = sc.nextLine(); |
| if (line.startsWith("#") || line.isBlank()) continue; |
| |
| int tab = line.indexOf('\t'); |
| String content = (tab >= 0) ? line.substring(tab + 1) : line; |
| if (!content.isBlank()) commonLines.add(content.trim()); |
| } |
| sc.close(); |
| System.out.println("[Blocklist] Loaded " + commonLines.size() |
| + " entries from: " + blocklistPath); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| public void cleanDirectory(String dataDir, boolean keepBackup) { |
| try { |
| if (commonLines.isEmpty()) { |
| System.out.println("[CleanPhase] Warning: no common lines loaded. " |
| + "Only regex patterns will be applied."); |
| } |
|
|
| FileHandler fh = new FileHandler(); |
| int processed = 0, linesRemoved = 0; |
|
|
| for (File f : fh.getFileListing(new File(dataDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
|
|
| CleanResult result = cleanFile(f, keepBackup); |
| processed++; |
| linesRemoved += result.linesRemoved; |
|
|
| if (result.linesRemoved > 0) { |
| System.out.println("[CleanPhase] " + f.getName() |
| + " — removed " + result.linesRemoved + " lines."); |
| } |
| } |
|
|
| System.out.println("[CleanPhase] Done. Files processed: " + processed |
| + " Total lines removed: " + linesRemoved); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| public CleanResult cleanFile(File file, boolean keepBackup) { |
| int removed = 0; |
| try { |
| |
| List<String> inputLines = new ArrayList<>(); |
| Scanner sc = new Scanner(file, "UTF-8"); |
| while (sc.hasNextLine()) inputLines.add(sc.nextLine()); |
| sc.close(); |
|
|
| |
| List<String> outputLines = new ArrayList<>(); |
| for (String line : inputLines) { |
| if (shouldRemove(line)) { |
| removed++; |
| } else { |
| outputLines.add(line); |
| } |
| } |
|
|
| if (removed > 0) { |
| |
| if (keepBackup) { |
| File bak = new File(file.getAbsolutePath() + ".bak"); |
| Files.copy(file.toPath(), bak.toPath(), |
| StandardCopyOption.REPLACE_EXISTING); |
| } |
|
|
| |
| Writer w = new OutputStreamWriter( |
| new FileOutputStream(file), "UTF-8"); |
| for (String l : outputLines) { |
| w.write(l + "\n"); |
| } |
| w.flush(); |
| w.close(); |
| } |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| return new CleanResult(file, removed); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public boolean shouldRemove(String rawLine) { |
| String trimmed = rawLine.trim(); |
|
|
| |
| if (trimmed.length() < MIN_LINE_LENGTH) return false; |
|
|
| |
| if (commonLines.contains(trimmed)) return true; |
|
|
| |
| for (Pattern p : BOILERPLATE_PATTERNS) { |
| if (p.matcher(trimmed).matches() || p.matcher(trimmed).find()) { |
| return true; |
| } |
| } |
|
|
| return false; |
| } |
|
|
| |
| |
| |
|
|
| |
| public Set<String> getCommonLines() { |
| return java.util.Collections.unmodifiableSet(commonLines); |
| } |
|
|
| |
| public Map<String, Integer> getLineFrequency() { |
| return java.util.Collections.unmodifiableMap(lineFrequency); |
| } |
|
|
| |
| |
| |
| public void printTopCommonLines(int n) { |
| System.out.println("--- Top " + n + " common lines (by sample frequency) ---"); |
| lineFrequency.entrySet().stream() |
| .filter(e -> commonLines.contains(e.getKey())) |
| .sorted((a, b) -> b.getValue() - a.getValue()) |
| .limit(n) |
| .forEach(e -> System.out.printf(" [%4d] %s%n", e.getValue(), e.getKey())); |
| } |
|
|
| |
| |
| |
|
|
| |
| public static class CleanResult { |
| public final File file; |
| public final int linesRemoved; |
|
|
| public CleanResult(File file, int linesRemoved) { |
| this.file = file; |
| this.linesRemoved = linesRemoved; |
| } |
| } |
| } |
|
|