import logging
import os
from io import BytesIO

import fitz  # PyMuPDF
import pandas as pd
import requests

from src.configs.config import LOG_DIR, MAX_TOKENS
from src.utils.helpers import chunk_text, clean_text

LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")
os.makedirs(LOG_DIR, exist_ok=True)  # ensure the log directory exists before configuring the handler
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)


def extract_and_clean_pdf_from_url(pdf_url, pdf_title, doc_id, faiss_index_start):
    """Extract, clean, and chunk PDF text directly from URL (no local file)."""
    try:
        response = requests.get(pdf_url, timeout=60)  # bound the request so a slow host can't hang the pipeline
        response.raise_for_status()

        all_text = ""
        with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
            for page in doc:
                text = page.get_text("text")
                if text:
                    text = text.encode('utf-8', errors='ignore').decode('utf-8')
                    all_text += text + "\n"

                # Table extraction
                try:
                    table_finder = page.find_tables()
                    if table_finder.tables:
                        logging.info(f"Tables detected on page {page.number}: {len(table_finder.tables)}")
                        for table in table_finder.tables:
                            # Table.extract() yields rows as lists of cell strings
                            # (Table.cells only stores cell bounding boxes, not text).
                            for row in table.extract():
                                row_text = []
                                for cell in row:
                                    if cell and isinstance(cell, str):
                                        cell = cell.encode('utf-8', errors='ignore').decode('utf-8')
                                        row_text.append(cell)
                                    else:
                                        row_text.append("")
                                all_text += " | ".join(row_text) + "\n"
                            all_text += "\n"
                except Exception as table_error:
                    logging.warning(f"Table extraction failed for page {page.number} in {pdf_title}: {table_error}")

        cleaned_text = clean_text(all_text)
        chunks = chunk_text(cleaned_text, max_tokens=MAX_TOKENS)

        chunk_data = [
            {"faiss_index": faiss_index_start + i, "text": chunk}
            for i, chunk in enumerate(chunks)
        ]

        logging.info(f"Processed {pdf_title} from URL: {len(chunks)} chunks")
        return {
            "Id": doc_id,
            "download_link": pdf_url,
            "pdf_title": pdf_title,
            "chunks": chunk_data,
        }

    except Exception as e:
        logging.error(f"Error processing {pdf_title} from URL {pdf_url}: {e}")
        return {
            "Id": doc_id,
            "download_link": pdf_url,
            "pdf_title": pdf_title,
            "chunks": [],
        }


def preprocess_pdfs_from_csv(csv_path, max_docs=None):
    """Chunk every PDF referenced in the metadata CSV, assigning each chunk a
    globally unique, contiguous FAISS index across documents."""
    metadata = pd.read_csv(csv_path)
    if max_docs:
        metadata = metadata.head(max_docs)

    chunks_data = []
    faiss_index_start = 0
    for _, row in metadata.iterrows():
        doc_id = row["Id"]
        pdf_title = row["Nom du document"]
        pdf_url = row["Lien"]
        chunk_data = extract_and_clean_pdf_from_url(pdf_url, pdf_title, doc_id, faiss_index_start)
        chunks_data.append(chunk_data)
        # Advance the offset so the next document's chunks continue the global index.
        faiss_index_start += len(chunk_data["chunks"])
    return chunks_data
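
# Minimal usage sketch, run only when the module is executed directly. The CSV
# path below is a placeholder assumption; the file is expected to carry the
# "Id", "Nom du document", and "Lien" columns read by preprocess_pdfs_from_csv.
if __name__ == "__main__":
    results = preprocess_pdfs_from_csv("data/documents.csv", max_docs=2)
    for doc in results:
        print(f'{doc["pdf_title"]}: {len(doc["chunks"])} chunks')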