| import os |
| import re |
| import fitz |
| import logging |
| from PIL import Image |
| from pdf2image import convert_from_path |
| import platform |
| import pytesseract |
| import docx |
| from odf.opendocument import load as load_odt |
| from odf.text import P |
|
|
| |
| |
| |
| |
| |
# Tell pytesseract which Tesseract executable to invoke for every OCR call
# made from this module.
# NOTE(review): the path is hard-coded and looks like a Linux install
# location; confirm it is valid on every platform this module runs on.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
def extract_text_from_pdf(file_path):
    """Extract text and hyperlink URIs from a PDF file.

    The embedded text layer of each page is used when it is non-blank. A page
    without extractable text is rasterised at 300 DPI and passed through
    Tesseract OCR instead.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A ``(text, hyperlinks)`` tuple. ``text`` is the concatenated text of
        all pages; ``hyperlinks`` is a list of unique link URIs in first-seen
        order. ``("", [])`` is returned when anything fails (the error is
        logged, not raised).
    """
    chunks = []
    hyperlinks = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text("text")

                if page_text.strip():
                    chunks.append(page_text)
                else:
                    # OCR only the current page (pdf2image pages are 1-based).
                    # Converting the whole file here would append the OCR
                    # text of every page once per text-less page.
                    images = convert_from_path(
                        file_path,
                        dpi=300,
                        first_page=page_num + 1,
                        last_page=page_num + 1,
                    )
                    chunks.extend(
                        pytesseract.image_to_string(image) for image in images
                    )

                hyperlinks.extend(
                    link["uri"] for link in page.get_links() if link.get("uri")
                )
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return "".join(chunks), list(dict.fromkeys(hyperlinks))
|
|
| |
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path to the DOCX file.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined with
        newlines, or ``""`` if the file cannot be read (the error is logged).
    """
    try:
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as err:
        logging.error(f"Error extracting text from DOCX: {err}")
        return ""
|
|
| |
def extract_text_from_rsf(file_path):
    """Read an RSF file as UTF-8 text and return its full contents.

    Args:
        file_path: Path to the file.

    Returns:
        The decoded file contents, or ``""`` if the file cannot be opened or
        decoded (the error is logged).
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as err:
        logging.error(f"Error extracting text from RSF: {err}")
        return ""
    return contents
|
|
| |
def extract_text_from_odt(file_path):
    """Return the paragraph text of an ODT file, one paragraph per line.

    The complete text of each paragraph is collected, including text nested
    in child elements. Reading only the first child node would drop the rest
    of a paragraph with mixed content and would fail outright when that first
    child is an element rather than a text node.

    Args:
        file_path: Path to the ODT file.

    Returns:
        The text of every non-empty paragraph joined with newlines, or ``""``
        if the file cannot be read (the error is logged).
    """
    try:
        # Imported inside the try so that an import failure is logged and
        # reported as "" like every other extraction error.
        from odf import teletype

        odt_doc = load_odt(file_path)
        paragraphs = odt_doc.getElementsByType(P)
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in paragraphs
            if paragraph.firstChild  # skip paragraphs with no content
        )
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""
|
|
| |
def extract_text_from_image(file_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        file_path: Path to the image file.

    Returns:
        The OCR output, or ``""`` if the image cannot be opened or processed
        (the error is logged).
    """
    try:
        # Close the image as soon as OCR finishes instead of leaving the
        # underlying file handle open until garbage collection.
        with Image.open(file_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""
|
|
| |
def preprocess_text(text):
    """Normalise whitespace and set phone-like numbers apart with spaces.

    All whitespace runs (including newlines) are collapsed to single spaces,
    then every 10-digit phone-like number (``ddd?ddd?dddd`` with an optional
    ``-``, ``.`` or whitespace separator) is padded with a space on each side
    so it is separated from adjacent punctuation. The padding is collapsed
    again afterwards so the result never contains consecutive spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line text without leading or trailing whitespace.
    """
    # Collapsing \s+ already removes newlines, so no separate pass is needed.
    collapsed = re.sub(r'\s+', ' ', text)
    padded = re.sub(
        r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', collapsed
    )
    # Padding a number that was already surrounded by spaces creates double
    # spaces; squeeze them so the output stays whitespace-normalised.
    return re.sub(r' {2,}', ' ', padded).strip()
|
|
| |
def extract_text_based_on_format(file_path):
    """Extract text from a file using the extractor for its extension.

    The extension is compared case-insensitively. Only PDFs yield hyperlinks;
    every other supported format returns an empty hyperlink list.

    Args:
        file_path: Path to the file to process.

    Returns:
        A ``(text, hyperlinks)`` tuple.

    Raises:
        ValueError: If the extension is not one of ``.pdf``, ``.docx``,
            ``.rsf``, ``.odt``, ``.png``, ``.jpg`` or ``.jpeg``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
        return text, hyperlinks
    if suffix == '.docx':
        return extract_text_from_docx(file_path), []
    if suffix == '.rsf':
        return extract_text_from_rsf(file_path), []
    if suffix == '.odt':
        return extract_text_from_odt(file_path), []
    if suffix in ('.png', '.jpg', '.jpeg'):
        return extract_text_from_image(file_path), []

    raise ValueError("Unsupported file format")
|
|
|
|
def clean_text_to_single_line(text):
    """Collapse all whitespace in ``text`` so it fits on a single line.

    Args:
        text: The text to flatten.

    Returns:
        The whitespace-separated tokens of ``text`` joined by single spaces,
        with no leading or trailing whitespace.
    """
    tokens = text.split()
    return ' '.join(tokens)