"""Preprocess PDF documents: download each PDF, extract page text and tables, clean the text, and split it into token-bounded chunks for indexing."""
import logging
import os
from io import BytesIO

import fitz  # PyMuPDF
import pandas as pd
import requests

from src.configs.config import LOG_DIR, MAX_TOKENS
from src.utils.helpers import chunk_text, clean_text
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
def extract_and_clean_pdf_from_url(pdf_url, pdf_title, doc_id, faiss_index_start):
"""Extract, clean, and chunk PDF text directly from URL (no local file)."""
try:
response = requests.get(pdf_url)
response.raise_for_status()
all_text = ""
with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
for page in doc:
text = page.get_text("text")
if text:
text = text.encode('utf-8', errors='ignore').decode('utf-8')
all_text += text + "\n"
# Table extraction
try:
table_finder = page.find_tables()
if table_finder.tables:
logging.info(f"Tables detected on page {page.number}: {len(table_finder.tables)}")
for table in table_finder.tables:
for row in table.cells:
row_text = []
for cell in row:
if cell and isinstance(cell, str):
cell = cell.encode('utf-8', errors='ignore').decode('utf-8')
row_text.append(cell)
else:
row_text.append("")
table_row = " | ".join(row_text)
all_text += table_row + "\n"
all_text += "\n"
except Exception as table_error:
logging.warning(f"Table extraction failed for page {page.number} in {pdf_title}: {table_error}")
cleaned_text = clean_text(all_text)
chunks = chunk_text(cleaned_text, max_tokens=MAX_TOKENS)
chunk_data = [
{
"faiss_index": faiss_index_start + i,
"text": chunk
}
for i, chunk in enumerate(chunks)
]
logging.info(f"Processed {pdf_title} from URL: {len(chunks)} chunks")
return {
"Id": doc_id,
"download_link": pdf_url,
"pdf_title": pdf_title,
"chunks": chunk_data
}
except Exception as e:
logging.error(f"Error processing {pdf_title} from URL {pdf_url}: {e}")
return {
"Id": doc_id,
"download_link": pdf_url,
"pdf_title": pdf_title,
"chunks": []
}
def preprocess_pdfs_from_csv(csv_path, max_docs=None):
metadata = pd.read_csv(csv_path)
if max_docs:
metadata = metadata.head(max_docs)
chunks_data = []
faiss_index_start = 0
for _, row in metadata.iterrows():
doc_id = row["Id"]
pdf_title = row["Nom du document"]
pdf_url = row["Lien"]
chunk_data = extract_and_clean_pdf_from_url(pdf_url, pdf_title, doc_id, faiss_index_start)
chunks_data.append(chunk_data)
faiss_index_start += len(chunk_data["chunks"])
return chunks_data