Spaces:
Sleeping
Sleeping
"""
PDF Loader Module
=================
Extracts text from PDF files using the pypdf library.
This module provides a simple interface for getting text out of PDF documents.
The extracted text is returned as a single string and is later processed by the chunker.
"""
| from pypdf import PdfReader | |
| from typing import Optional | |
class PDFLoader:
    """Extract plain text (and optionally metadata) from PDF documents.

    This is the first stage of the RAG pipeline:
    - the raw text is pulled out of the PDF file here;
    - that text is later chunked and turned into embeddings.

    Example:
        loader = PDFLoader()
        text = loader.load("document.pdf")
    """

    def __init__(self):
        """Create the loader (it keeps no state and takes no parameters)."""

    @staticmethod
    def _extract_text(pages) -> str:
        """Join the text of all pages that produce any text.

        Args:
            pages: Iterable of page objects exposing ``extract_text()``.

        Returns:
            The page texts joined with a newline character. Pages whose
            ``extract_text()`` result is falsy (empty string or None) are
            skipped, so they do not contribute blank lines. An empty string
            is returned when no page yields text.
        """
        page_texts = (page.extract_text() for page in pages)
        return "\n".join(text for text in page_texts if text)

    def load(self, pdf_path: str) -> str:
        """Load a PDF file and return its text.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            The full text of the PDF as one string, with pages separated
            by a newline character.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If the PDF cannot be read or parsing fails
                (errors from pypdf are propagated unchanged).
        """
        reader = PdfReader(pdf_path)
        return self._extract_text(reader.pages)

    def load_with_metadata(self, pdf_path: str) -> dict:
        """Load a PDF file and return its text together with metadata.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A dict with the keys:
            - 'text': the full text, pages separated by a newline character
            - 'num_pages': number of pages in the document
            - 'metadata': dict with 'title', 'author', 'subject' and
              'creator' (each defaulting to '' when the field is missing),
              or an empty dict when the document carries no metadata
        """
        reader = PdfReader(pdf_path)
        text = self._extract_text(reader.pages)

        # Fetch the metadata object once and reuse it for every field.
        info = reader.metadata
        metadata = {}
        if info:
            metadata = {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'subject': info.get('/Subject', ''),
                'creator': info.get('/Creator', ''),
            }

        return {
            'text': text,
            'num_pages': len(reader.pages),
            'metadata': metadata,
        }
# Sample usage (manual smoke test): python <this file> <path-to-pdf>
if __name__ == "__main__":
    import sys

    # Everything after the script name; empty when no path was supplied.
    cli_args = sys.argv[1:]
    if cli_args:
        extracted = PDFLoader().load(cli_args[0])
        # Report the total character count, then a preview of at most 500 characters.
        print(f"PDF’dan {len(extracted)} ta belgilar ajratildi")
        print(f"Matnning 500 belgigacha preview: {extracted[:500]}...")