File size: 2,873 Bytes
4d35814
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/**
 * Text file processing utilities
 * Handles text file detection, reading, and validation
 */

import {
	DEFAULT_BINARY_DETECTION_OPTIONS,
	type BinaryDetectionOptions
} from '$lib/constants/binary-detection';
import { FileExtensionText } from '$lib/enums/files';

/**
 * Decide whether a filename looks like a text file, judging only by its suffix.
 * The filename is lower-cased before it is compared against the values of
 * `FileExtensionText`, so the filename's own casing does not matter.
 * @param filename - The filename to inspect
 * @returns True when the lower-cased filename ends with any `FileExtensionText` value
 */
export function isTextFileByName(filename: string): boolean {
	// Lower-case once up front instead of once per candidate extension
	const loweredName = filename.toLowerCase();

	for (const extension of Object.values(FileExtensionText)) {
		if (loweredName.endsWith(extension)) {
			return true;
		}
	}

	return false;
}

/**
 * Load the content of a file as a string using a `FileReader`
 * @param file - The file whose content should be read
 * @returns Promise that resolves with the text produced by the reader; it rejects
 * with an `Error` if the reader reports an error or finishes without a result
 */
export async function readFileAsText(file: File): Promise<string> {
	return new Promise((resolve, reject) => {
		const fileReader = new FileReader();

		fileReader.onerror = () => {
			reject(new Error('File reading error'));
		};

		fileReader.onload = (loadEvent) => {
			const contents = loadEvent.target?.result;

			// A missing result (null or undefined) is treated as a failed read;
			// an empty string is a valid result and is resolved as-is
			if (contents === null || contents === undefined) {
				reject(new Error('Failed to read file'));

				return;
			}

			resolve(contents as string);
		};

		fileReader.readAsText(file);
	});
}

/**
 * Heuristic check to determine if content is likely from a text file
 *
 * Only the first `prefixLength` characters are inspected. Within that sample the
 * function counts null characters and "suspicious" characters, then rejects the
 * content when either count exceeds its configured limit.
 *
 * Suspicious characters are:
 * - control codes 1-7 and 14-26
 * - the Unicode replacement character (U+FFFD)
 *
 * Codes 8-13 and 27-31 are not counted.
 *
 * @param content - The file content to analyze; empty content is treated as text
 * @param options - Optional overrides for the detection parameters. A property that
 * is missing or explicitly `undefined` falls back to the value in
 * `DEFAULT_BINARY_DETECTION_OPTIONS`
 * @returns True if the content appears to be text-based
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	if (!content) return true;

	// Resolve each setting individually: spreading `options` over the defaults would
	// let an explicit `undefined` replace a default and silently disable the checks
	const prefixLength = options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength;
	const maxAbsoluteNullBytes =
		options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes;
	const suspiciousCharThresholdRatio =
		options.suspiciousCharThresholdRatio ??
		DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio;

	const sample = content.substring(0, prefixLength);

	// Nothing to inspect (e.g. a non-positive prefix length): nothing looks binary
	if (sample.length === 0) return true;

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Count null bytes - these are strong indicators of binary files
		if (charCode === 0) {
			nullCount++;

			// Stop scanning as soon as the limit is exceeded; the verdict cannot change
			if (nullCount > maxAbsoluteNullBytes) return false;

			continue;
		}

		// Control codes 1-7 and 14-26 count as suspicious; 8-13 and 27-31 do not
		const isSuspiciousControl = charCode < 8 || (charCode > 13 && charCode < 27);

		// The replacement character indicates the content could not be decoded cleanly
		const isReplacementChar = charCode === 0xfffd;

		if (isSuspiciousControl || isReplacementChar) {
			suspiciousControlCount++;
		}
	}

	// Reject if too many suspicious characters relative to the sample size
	if (suspiciousControlCount / sample.length > suspiciousCharThresholdRatio) return false;

	return true;
}