File size: 3,501 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from src.utils.helpers import clean_text, chunk_text
from src.configs.config import MAX_TOKENS, LOG_DIR
from src.utils.helpers import clean_text, chunk_text
from src.configs.config import MAX_TOKENS, LOG_DIR
import requests
import fitz  # PyMuPDF
from io import BytesIO
import pandas as pd
import logging
import os

# Log to both the console and a per-module file under LOG_DIR.
# (Previously LOG_FILE was computed but never wired into logging.)
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")
os.makedirs(LOG_DIR, exist_ok=True)  # FileHandler fails if the directory is missing
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    handlers=[logging.StreamHandler(), logging.FileHandler(LOG_FILE)],
)

def extract_and_clean_pdf_from_url(pdf_url, pdf_title, doc_id, faiss_index_start):
    """Download a PDF, extract page text and tables, then clean and chunk it.

    Args:
        pdf_url: Direct download URL of the PDF (no local file is written).
        pdf_title: Human-readable title, used only in log messages.
        doc_id: Identifier copied verbatim into the returned record.
        faiss_index_start: Global FAISS index assigned to the first chunk;
            subsequent chunks get consecutive indices.

    Returns:
        dict with keys "Id", "download_link", "pdf_title", "chunks".
        "chunks" is a list of {"faiss_index": int, "text": str}. On any
        failure the same record is returned with "chunks" == [] — this
        function is best-effort and never raises.
    """
    try:
        # Timeout so one dead host cannot stall the whole batch.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()

        all_text = ""
        with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
            for page in doc:
                text = page.get_text("text")
                if text:
                    # Drop byte sequences that are not valid UTF-8
                    # (occasionally produced by broken PDF encodings).
                    text = text.encode('utf-8', errors='ignore').decode('utf-8')
                    all_text += text + "\n"

                # Table extraction. NOTE: Table.cells holds cell *rectangles*
                # (coordinate tuples), not text — iterating it never yielded a
                # string, so every table row came out empty. Table.extract()
                # returns the actual cell text row by row.
                try:
                    table_finder = page.find_tables()
                    if table_finder.tables:
                        logging.info(f"Tables detected on page {page.number}: {len(table_finder.tables)}")
                    for table in table_finder.tables:
                        for row in table.extract():
                            row_text = []
                            for cell in row:
                                if cell and isinstance(cell, str):
                                    cell = cell.encode('utf-8', errors='ignore').decode('utf-8')
                                    row_text.append(cell)
                                else:
                                    # Empty/None cells keep their column slot.
                                    row_text.append("")
                            all_text += " | ".join(row_text) + "\n"
                        all_text += "\n"
                except Exception as table_error:
                    # Tables are a bonus; a bad table must not lose page text.
                    logging.warning(f"Table extraction failed for page {page.number} in {pdf_title}: {table_error}")

        cleaned_text = clean_text(all_text)
        chunks = chunk_text(cleaned_text, max_tokens=MAX_TOKENS)

        # Each chunk carries its globally unique FAISS index.
        chunk_data = [
            {
                "faiss_index": faiss_index_start + i,
                "text": chunk
            }
            for i, chunk in enumerate(chunks)
        ]

        logging.info(f"Processed {pdf_title} from URL: {len(chunks)} chunks")
        return {
            "Id": doc_id,
            "download_link": pdf_url,
            "pdf_title": pdf_title,
            "chunks": chunk_data
        }

    except Exception as e:
        # Best-effort: log and return an empty record so callers can continue.
        logging.error(f"Error processing {pdf_title} from URL {pdf_url}: {e}")
        return {
            "Id": doc_id,
            "download_link": pdf_url,
            "pdf_title": pdf_title,
            "chunks": []
        }


def preprocess_pdfs_from_csv(csv_path, max_docs=None):
    """Process every PDF listed in a metadata CSV into chunk records.

    Args:
        csv_path: Path to a CSV with columns "Id", "Nom du document"
            (title) and "Lien" (download URL).
        max_docs: If not None, only the first ``max_docs`` rows are
            processed. Checked with ``is not None`` so an explicit
            ``max_docs=0`` is honored instead of silently processing
            everything (the old truthiness check ignored 0).

    Returns:
        List of per-document dicts as produced by
        extract_and_clean_pdf_from_url; FAISS indices run contiguously
        across all documents.
    """
    metadata = pd.read_csv(csv_path)
    if max_docs is not None:
        metadata = metadata.head(max_docs)

    chunks_data = []
    faiss_index_start = 0

    for _, row in metadata.iterrows():
        doc_record = extract_and_clean_pdf_from_url(
            row["Lien"], row["Nom du document"], row["Id"], faiss_index_start
        )
        chunks_data.append(doc_record)
        # Advance the global FAISS offset past this document's chunks so
        # indices stay unique across the whole corpus.
        faiss_index_start += len(doc_record["chunks"])

    return chunks_data