# rag-pdf-chat / rag_system / pdf_loader.py
# Mehriddin1997's picture
# clean project
# b8f0598
"""
PDF Loader Module
=================
Extracts text from PDF files using the pypdf library.
This module provides a simple interface for getting text out of PDF documents.
The extracted text is returned as a single string and is later processed by the chunker.
"""
from pypdf import PdfReader
from typing import Optional
class PDFLoader:
    """
    Extract text from PDF documents.

    This is the first stage of the RAG pipeline: raw text is pulled out of
    a PDF here, and is later chunked and turned into embeddings.

    Example:
        loader = PDFLoader()
        text = loader.load("document.pdf")
    """

    def __init__(self):
        """Create the loader; no parameters are required."""

    @staticmethod
    def _join_page_texts(pages) -> str:
        """
        Join the extracted text of every page into one string.

        Args:
            pages: Iterable of page objects exposing ``extract_text()``.

        Returns:
            The page texts joined with a single newline character. Pages
            whose extracted text is empty or None are skipped, so they do
            not contribute blank separator lines.
        """
        extracted = (page.extract_text() for page in pages)
        return "\n".join(page_text for page_text in extracted if page_text)

    def load(self, pdf_path: str) -> str:
        """
        Load a PDF file and return its text.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            The text of the whole PDF as a single string, with pages
            separated by a newline character.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If the PDF cannot be read or parsing fails. Nothing
                is caught here, so errors from pypdf propagate unchanged.
        """
        reader = PdfReader(pdf_path)
        return self._join_page_texts(reader.pages)

    def load_with_metadata(self, pdf_path: str) -> dict:
        """
        Load a PDF file and return its text together with metadata.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            A dict with the keys:
                - 'text': The full text, pages separated by a newline.
                - 'num_pages': Number of pages in the document (including
                  pages that yielded no text).
                - 'metadata': Dict with 'title', 'author', 'subject' and
                  'creator' (each defaulting to '' when absent), or an
                  empty dict when the PDF carries no metadata.
        """
        reader = PdfReader(pdf_path)
        text = self._join_page_texts(reader.pages)

        # Read the metadata attribute once instead of once per field.
        info = reader.metadata
        metadata = {}
        if info:
            metadata = {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'subject': info.get('/Subject', ''),
                'creator': info.get('/Creator', ''),
            }

        return {
            'text': text,
            'num_pages': len(reader.pages),
            'metadata': metadata,
        }
# Example usage (manual smoke test): pass a PDF path as the first argument.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # Only the first argument is used; any extras are ignored.
        extracted = PDFLoader().load(cli_args[0])
        char_count = len(extracted)
        preview = extracted[:500]
        print(f"PDF’dan {char_count} ta belgilar ajratildi")
        print(f"Matnning 500 belgigacha preview: {preview}...")