Spaces:
Sleeping
Sleeping
import logging
import os
from io import BytesIO

import fitz  # PyMuPDF
import pandas as pd
import requests

from src.configs.config import MAX_TOKENS, LOG_DIR
from src.utils.helpers import clean_text, chunk_text
# Destination file for this module's log output.
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")
# BUGFIX: LOG_FILE was built but never used -- basicConfig had no filename,
# so logs went to stderr and the computed path was dead code. Route the log
# to the intended file. (NOTE(review): this silences console output; confirm
# that is the intent, or add a StreamHandler as well.)
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)
def extract_and_clean_pdf_from_url(pdf_url, pdf_title, doc_id, faiss_index_start):
    """Extract, clean, and chunk PDF text directly from a URL (no local file).

    Args:
        pdf_url: Direct download link to the PDF.
        pdf_title: Human-readable title, used only in log messages.
        doc_id: Identifier copied verbatim into the returned record.
        faiss_index_start: Offset assigned to the first chunk's "faiss_index".

    Returns:
        dict with keys "Id", "download_link", "pdf_title" and "chunks", where
        "chunks" is a list of {"faiss_index": int, "text": str} dicts.
        On any failure the same record is returned with an empty "chunks"
        list so the caller's batch loop can continue.
    """
    try:
        # Timeout so a single unreachable host cannot hang the whole batch.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()
        # Collect fragments and join once at the end: repeated `str +=`
        # accumulation is quadratic on large documents.
        parts = []
        with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
            for page in doc:
                text = page.get_text("text")
                if text:
                    # Drop undecodable byte sequences instead of crashing.
                    parts.append(text.encode('utf-8', errors='ignore').decode('utf-8'))
                    parts.append("\n")
                # Table extraction is best-effort: failures are logged, never fatal.
                try:
                    table_finder = page.find_tables()
                    if table_finder.tables:
                        logging.info(f"Tables detected on page {page.number}: {len(table_finder.tables)}")
                        for table in table_finder.tables:
                            # BUGFIX: `table.cells` is a flat list of cell bbox
                            # coordinate tuples, so `isinstance(cell, str)` was
                            # always False and every table row came out empty.
                            # Table.extract() returns rows of cell text (str or
                            # None), which is what this loop actually expects.
                            for row in table.extract():
                                row_text = []
                                for cell in row:
                                    if cell and isinstance(cell, str):
                                        cell = cell.encode('utf-8', errors='ignore').decode('utf-8')
                                        row_text.append(cell)
                                    else:
                                        # None / empty cell -> keep column alignment.
                                        row_text.append("")
                                parts.append(" | ".join(row_text))
                                parts.append("\n")
                            # Blank line between tables.
                            parts.append("\n")
                except Exception as table_error:
                    logging.warning(f"Table extraction failed for page {page.number} in {pdf_title}: {table_error}")
        all_text = "".join(parts)
        cleaned_text = clean_text(all_text)
        chunks = chunk_text(cleaned_text, max_tokens=MAX_TOKENS)
        chunk_data = [
            {
                "faiss_index": faiss_index_start + i,
                "text": chunk,
            }
            for i, chunk in enumerate(chunks)
        ]
        logging.info(f"Processed {pdf_title} from URL: {len(chunks)} chunks")
        return {
            "Id": doc_id,
            "download_link": pdf_url,
            "pdf_title": pdf_title,
            "chunks": chunk_data,
        }
    except Exception as e:
        # Top-level boundary: log and return an empty record so one bad
        # document does not abort the whole preprocessing run.
        logging.error(f"Error processing {pdf_title} from URL {pdf_url}: {e}")
        return {
            "Id": doc_id,
            "download_link": pdf_url,
            "pdf_title": pdf_title,
            "chunks": [],
        }
def preprocess_pdfs_from_csv(csv_path, max_docs=None):
    """Process every PDF listed in a metadata CSV into chunk records.

    Args:
        csv_path: Path to a CSV with columns "Id", "Nom du document"
            (document name) and "Lien" (download link).
        max_docs: Optional cap on the number of CSV rows processed;
            None means process all rows.

    Returns:
        List of per-document dicts as produced by
        extract_and_clean_pdf_from_url, in CSV row order.
    """
    metadata = pd.read_csv(csv_path)
    # BUGFIX: `if max_docs:` silently ignored an explicit max_docs=0;
    # only None should mean "no limit".
    if max_docs is not None:
        metadata = metadata.head(max_docs)
    chunks_data = []
    faiss_index_start = 0
    for _, row in metadata.iterrows():
        record = extract_and_clean_pdf_from_url(
            row["Lien"], row["Nom du document"], row["Id"], faiss_index_start
        )
        chunks_data.append(record)
        # Advance the offset so FAISS indices stay globally unique
        # across documents (failed documents contribute zero chunks).
        faiss_index_start += len(record["chunks"])
    return chunks_data