Spaces:

muyeong
/

119_ChatBot_v2

Sleeping

App Files Files Community

119_ChatBot_v2 / pdf_processor.py

muyeong

Upload pdf_processor.py with huggingface_hub

8e0c1c9 verified 2 months ago

raw

history blame contribute delete

2.75 kB

	"""
	PDF 처리 모듈
	- PDF 텍스트 추출
	- 텍스트 청킹
	- 임베딩 생성 및 저장
	"""

	import fitz # PyMuPDF
	from typing import List, Dict
	from rag import get_embedding
	from database import save_document


	def extract_text_from_pdf(pdf_path: str) -> str:
	    """Extract the text of every page of a PDF as one string.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``fitz.open``.

	    Returns:
	        The text of all pages concatenated in page order, with leading
	        and trailing whitespace stripped. May be an empty string.

	    Raises:
	        Exception: If opening or reading the document fails. The message
	            is prefixed with "PDF 텍스트 추출 실패" and the underlying
	            error is chained as ``__cause__``.
	    """
	    try:
	        # The context manager closes the document even when reading a page
	        # raises, so the file handle is never leaked.
	        with fitz.open(pdf_path) as doc:
	            # join() avoids the quadratic cost of repeated string +=.
	            return "".join(page.get_text() for page in doc).strip()
	    except Exception as e:
	        raise Exception(f"PDF 텍스트 추출 실패: {str(e)}") from e


	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
	    """Split text into overlapping chunks, preferring sentence boundaries.

	    Each chunk covers at most ``chunk_size`` characters. When a chunk would
	    end in the middle of the text, it is shortened to end just after the
	    last sentence ender (period, question mark or exclamation mark followed
	    by a space), or failing that the last blank line or line break, inside
	    the window. The next chunk starts ``overlap`` characters before the end
	    of the previous one.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of characters per chunk; must be > 0.
	        overlap: Number of characters shared between consecutive chunks;
	            must satisfy ``0 <= overlap < chunk_size``.

	    Returns:
	        The whitespace-stripped, non-empty chunks in document order. An
	        empty ``text`` yields an empty list.

	    Raises:
	        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
	    if not text:
	        return []

	    # Ordered by preference: sentence enders first, then paragraph and
	    # line breaks.
	    separators = (". ", "? ", "! ", "\n\n", "\n")

	    chunks = []
	    start = 0
	    text_length = len(text)

	    while start < text_length:
	        end = start + chunk_size

	        # Try to cut at a sentence end unless this is the final chunk.
	        if end < text_length:
	            # Only accept break points at or beyond start + overlap. A break
	            # any earlier would make the next start (end - overlap) land at
	            # or before the current start, which loops forever or skips
	            # text.
	            search_from = start + overlap
	            for punct in separators:
	                last_punct = text.rfind(punct, search_from, end)
	                if last_punct != -1:
	                    # Keep the first separator character (e.g. the period)
	                    # in this chunk.
	                    end = last_punct + 1
	                    break

	        chunk = text[start:end].strip()
	        if chunk:
	            chunks.append(chunk)

	        # Step back by the overlap, except after the final chunk.
	        start = end - overlap if end < text_length else text_length

	    return chunks


	def process_pdf(
	    pdf_path: str,
	    title: str,
	    category: str = "일반",
	    chunk_size: int = 500,
	    overlap: int = 50,
	) -> Dict:
	    """Run the PDF ingestion pipeline: extract, chunk, embed, store.

	    Steps:
	        1. Extract the text with ``extract_text_from_pdf``.
	        2. Split it with ``chunk_text``.
	        3. Compute an embedding for each chunk with ``get_embedding``.
	        4. Store each chunk with ``save_document`` under the title
	           "<title> (Part N)", numbered from 1.

	    Args:
	        pdf_path: Path of the PDF file to process.
	        title: Base title used for the stored chunks.
	        category: Category passed through to ``save_document``.
	        chunk_size: Maximum chunk length passed to ``chunk_text``.
	        overlap: Overlap between consecutive chunks passed to
	            ``chunk_text``.

	    Returns:
	        A dict with the keys ``"success"`` (bool), ``"message"`` (str,
	        user-facing status text) and ``"chunks_count"`` (int, number of
	        chunks for which ``save_document`` returned a truthy value).
	        Exceptions raised by any step are caught and reported through
	        ``"message"`` rather than propagated.
	    """
	    result = {
	        "success": False,
	        "message": "",
	        "chunks_count": 0,
	    }

	    try:
	        # 1. Extract text
	        text = extract_text_from_pdf(pdf_path)
	        if not text:
	            result["message"] = "PDF에서 텍스트를 추출할 수 없습니다."
	            return result

	        # 2. Chunk
	        chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
	        if not chunks:
	            result["message"] = "텍스트 분할에 실패했습니다."
	            return result

	        # 3. Embed and store each chunk
	        saved_count = 0
	        for part_number, chunk in enumerate(chunks, start=1):
	            chunk_title = f"{title} (Part {part_number})"
	            embedding = get_embedding(chunk)

	            # A falsy return from save_document is treated as "not saved".
	            if save_document(chunk_title, chunk, embedding, category):
	                saved_count += 1

	        result["chunks_count"] = saved_count

	        # Do not report success when every save was rejected.
	        if saved_count == 0:
	            result["message"] = "❌ 청크를 저장하지 못했습니다."
	            return result

	        result["success"] = True
	        result["message"] = f"✅ {saved_count}개 청크가 성공적으로 저장되었습니다."

	    except Exception as e:
	        # Boundary handler: callers receive the failure in the result dict.
	        result["message"] = f"❌ 처리 중 오류 발생: {str(e)}"

	    return result