"""
Text extraction module for Norwegian RAG chatbot.
Extracts text from various document formats.
"""

import os
import PyPDF2
from typing import List, Optional
from bs4 import BeautifulSoup

|
class TextExtractor:
    """
    Extracts text from various document formats.

    Supported file formats:
      - PDF (.pdf)
      - Plain text (.txt) — UTF-8 with Latin-1 fallback
      - HTML (.html, .htm)
    Web pages can also be fetched and extracted via ``extract_from_url``.

    All extraction methods are best-effort: on failure they print the
    error and return an empty string instead of raising.
    """

    # Timeout (seconds) for HTTP requests in extract_from_url; without it
    # a hung server would block the caller forever.
    _REQUEST_TIMEOUT = 30

    @staticmethod
    def extract_from_file(file_path: str) -> str:
        """
        Extract text from a file based on its extension.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text content

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file extension is not supported.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_extension = os.path.splitext(file_path)[1].lower()

        # Dispatch table keeps the extension -> extractor mapping easy to extend.
        handlers = {
            '.pdf': TextExtractor.extract_from_pdf,
            '.txt': TextExtractor.extract_from_text,
            '.html': TextExtractor.extract_from_html,
            '.htm': TextExtractor.extract_from_html,
        }
        handler = handlers.get(file_extension)
        if handler is None:
            raise ValueError(f"Unsupported file format: {file_extension}")
        return handler(file_path)

    @staticmethod
    def extract_from_pdf(file_path: str) -> str:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, with pages separated by blank lines,
            or "" on any extraction error.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() can yield None/empty for image-only pages;
                # guard with `or ""` so concatenation never fails.
                return "".join(
                    (page.extract_text() or "") + "\n\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error extracting text from PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_text(file_path: str) -> str:
        """
        Extract text from a plain text file.

        Tries UTF-8 first, then falls back to Latin-1 (common for legacy
        Norwegian text files).

        Args:
            file_path: Path to the text file

        Returns:
            File contents, or "" on any read error.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8 — retry with Latin-1, which accepts any byte.
            try:
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error extracting text from file {file_path}: {str(e)}")
                return ""
        except Exception as e:
            print(f"Error extracting text from file {file_path}: {str(e)}")
            return ""

    @staticmethod
    def _soup_to_text(soup) -> str:
        """
        Reduce a BeautifulSoup document to clean plain text.

        Removes <script>/<style> content, then collapses whitespace so the
        result is one non-empty trimmed chunk per line. Shared by
        extract_from_html and extract_from_url.
        """
        for tag in soup(["script", "style"]):
            tag.extract()

        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return '\n'.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def extract_from_html(file_path: str) -> str:
        """
        Extract text from an HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            Extracted text content, or "" on any extraction error.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            return TextExtractor._soup_to_text(soup)
        except Exception as e:
            print(f"Error extracting text from HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def extract_from_url(url: str) -> str:
        """
        Extract text from a web URL.

        Args:
            url: Web URL to extract text from

        Returns:
            Extracted text content, or "" on any network/parse error.
        """
        try:
            # Imported lazily so file-based extraction works even when
            # requests is not installed.
            import requests
            # Fix: bound the request with a timeout (a dead server must not
            # hang us) and surface HTTP errors instead of "extracting" the
            # body of a 404/500 error page.
            response = requests.get(url, timeout=TextExtractor._REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return TextExtractor._soup_to_text(soup)
        except Exception as e:
            print(f"Error extracting text from URL {url}: {str(e)}")
            return ""
| |
|