File size: 416 Bytes
54bef2f
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
# ...existing code...
import re
from pathlib import Path

# Matches a word followed by one or more whitespace-separated repeats of
# itself, case-insensitively (e.g. "the the", "is Is is"). The single capture
# group is the first occurrence of the word. Compiled once here so the loop
# below reuses the same pattern object for every file.
REPEATED_WORD_RE = re.compile(r"\b(\w+)(?:\s+\1\b)+", flags=re.IGNORECASE)

# Directory whose *.txt files are scanned (relative to the working directory).
p = Path("data/cache")

# Report, per file, whether it contains adjacent repeated words.
# sorted() keeps the report order reproducible; glob() guarantees no order.
for f in sorted(p.glob("*.txt")):
    try:
        text = f.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # One unreadable or non-UTF-8 file should not abort the whole scan.
        print(f"{f.name} could not be read: {exc}")
        continue
    # findall() returns only the captured word for each repeated run.
    matches = REPEATED_WORD_RE.findall(text)
    if matches:
        # Show at most the first ten repeated words to keep the line short.
        print(f"{f.name} has repeated words sample: {matches[:10]}")
    else:
        print(f"{f.name} looks ok")