File size: 1,811 Bytes
a8639ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import os
def calculate_ascii_percentage(file_path):
    """
    Computes the percentage of bytes in a file that are ASCII (values 0-127).

    The file is opened in binary mode and read in fixed-size chunks, so memory
    use stays bounded no matter how large the file is.

    Args:
        file_path (str): Path to the file to inspect.

    Returns:
        float or None: Percentage (0-100) of bytes whose value is below 128,
        0.0 for an empty file, or None if the file could not be processed
        (the error is printed as "Error: <message>").
    """
    ascii_bytes = bytes(range(128))
    chunk_size = 1 << 20  # 1 MiB per read keeps memory use flat.
    total_bytes = 0
    non_ascii_bytes = 0
    try:
        with open(file_path, "rb") as f:
            # Two-argument iter() keeps calling f.read() until it returns b"".
            for chunk in iter(lambda: f.read(chunk_size), b""):
                total_bytes += len(chunk)
                # Deleting every ASCII byte leaves only the non-ASCII ones;
                # translate() does this in C instead of a per-byte Python loop.
                non_ascii_bytes += len(chunk.translate(None, ascii_bytes))
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None
        # instead of raising, so callers can simply test the result.
        print(f"Error: {e}")
        return None
    if total_bytes == 0:
        # An empty file has no bytes to classify; report 0% rather than
        # dividing by zero.
        return 0.0
    return ((total_bytes - non_ascii_bytes) / total_bytes) * 100
# Corpus file to analyse; a leading "~" is expanded to the user's home directory.
file_path = os.path.expanduser(
    "~/torch_datasets/github-python/corpus/data/corpus_processed.txt"
)

# Report the overall share of ASCII bytes. A None result means the helper
# already printed an error, so nothing more is shown in that case.
ascii_percentage = calculate_ascii_percentage(file_path)
if ascii_percentage is not None:
    print(f"Percentage of ASCII characters: {ascii_percentage:.2f}%")
def find_unicode_passages(file_path, threshold=0.5, min_length=20):
    """
    Prints passages with a high density of non-ASCII characters.

    Each line is stripped of surrounding whitespace first; both its length and
    its non-ASCII count are measured on that stripped text, so the reported
    density always lies between 0% and 100%. The file is decoded as UTF-8 and
    undecodable bytes are ignored.

    Args:
        file_path (str): Path to the input file.
        threshold (float): Proportion of non-ASCII characters to flag a line.
        min_length (int): Minimum stripped length of a line to be considered.

    Returns:
        None. Flagged lines are printed with their line number and density;
        any error is printed as "Error: <message>".
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            for line_num, line in enumerate(f, start=1):
                text = line.strip()
                total_chars = len(text)
                # Blank lines are skipped explicitly so that min_length <= 0
                # cannot lead to a division by zero below.
                if total_chars == 0 or total_chars < min_length:
                    continue
                # Count on the stripped text: strip() also removes non-ASCII
                # whitespace (e.g. U+3000), which must not inflate the density.
                non_ascii_count = sum(1 for c in text if ord(c) >= 128)
                density = non_ascii_count / total_chars
                if density > threshold:
                    print(f"Line {line_num}: {text}")
                    print(
                        f" -> Non-ASCII Density: {density:.2%}"
                    )
    except Exception as e:
        # Best-effort scan: report the problem instead of raising.
        print(f"Error: {e}")
# Example usage: list every line of at least 20 characters in which more than
# half of the characters are non-ASCII.
find_unicode_passages(
    file_path,
    threshold=0.5,
    min_length=20,
)
|