File size: 1,419 Bytes
9057a10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import PyPDF2
import os
from langdetect import detect
def read_txt_file(filepath: str) -> str:
    """Return the full contents of a UTF-8 encoded text file.

    This is a best-effort reader: any exception raised while opening or
    reading the file is reported with ``print`` and an empty string is
    returned instead of propagating the error.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The file contents, or "" if the file could not be read.
    """
    try:
        with open(filepath, mode="r", encoding="utf-8") as handle:
            return handle.read()
    except Exception as err:
        print(f"β Error reading TXT file: {err}")
    # Only reached when the read above failed.
    return ""
def read_pdf_file(filepath: str) -> str:
    """Extract text from a PDF file using PyPDF2.

    This is a best-effort reader: any exception raised while opening or
    parsing the file is reported with ``print`` and an empty string is
    returned instead of propagating the error.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The text of every page concatenated in page order, with leading
        and trailing whitespace stripped, or "" if the file could not be
        read.
    """
    try:
        with open(filepath, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extraction must happen while the file is still open, so the
            # join stays inside the ``with`` block. Joining once avoids
            # repeated string concatenation, and ``or ""`` keeps a page
            # that yields no text (None or "") from raising TypeError and
            # discarding the text of every other page.
            text = "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
        return text.strip()
    except Exception as e:
        print(f"β Error reading PDF file: {e}")
        return ""
def read_file(filepath: str) -> str:
    """Read a ``.txt`` or ``.pdf`` file and return its content as text.

    The reader is chosen from the file extension, compared
    case-insensitively. Any other extension is reported with ``print``
    and produces an empty string.

    Args:
        filepath: Path of the file to read.

    Returns:
        The result of ``read_txt_file`` or ``read_pdf_file`` for a
        supported extension, otherwise "".
    """
    _, extension = os.path.splitext(filepath)
    extension = extension.lower()

    if extension == ".txt":
        return read_txt_file(filepath)
    if extension == ".pdf":
        return read_pdf_file(filepath)

    print(f"β Unsupported file type: {filepath}")
    return ""
def detect_language(text: str) -> str:
    """Return the result of langdetect's ``detect`` for ``text``.

    Any exception raised during detection is swallowed and the
    placeholder string "unknown" is returned instead.

    Args:
        text: The text whose language should be detected.

    Returns:
        Whatever ``detect`` returns for ``text``, or "unknown" when
        detection raises.
    """
    try:
        language = detect(text)
    except Exception:
        language = "unknown"
    return language
|