kwmin_probin / utils /pdf_utils.py
cksleigen's picture
add files
c2f0e66
# utils/pdf_utils.py
"""PDF text coordinate extraction utilities."""
import fitz # PyMuPDF
from typing import List, Dict
def get_text_coordinates(pdf_path: str, page_num: int, search_text: str) -> List[Dict]:
    """Find a text snippet on one PDF page and return highlight annotations.

    Only the first 100 characters of ``search_text`` (then stripped of
    surrounding whitespace) are searched for. Every rectangle returned by the
    page search becomes one annotation dict with the keys ``page``, ``x``,
    ``y``, ``width``, ``height``, ``color`` and ``opacity``.

    Args:
        pdf_path: Path of the PDF file to open.
        page_num: Page number to search (1-based).
        search_text: Text to look for on that page.

    Returns:
        A list of annotation dicts, one per match. The list is empty when
        nothing matched or when any error occurred; errors are printed and
        never propagated to the caller.
    """
    try:
        # page_num - 1 is negative for page_num <= 0, and PyMuPDF resolves
        # negative page indexes from the end of the document, so such input
        # would silently search the wrong page. Reject it via the normal
        # error path instead.
        if page_num < 1:
            raise ValueError(f"page_num must be >= 1 (1-based), got {page_num}")

        # Long search strings are truncated; only the leading part is used.
        search_query = search_text[:100].strip()

        # The context manager closes the document even when the page lookup
        # or the search raises, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            page = doc[page_num - 1]
            # Annotation layout for streamlit-pdf-viewer, highlighter style.
            annotations = [
                {
                    "page": page_num,  # 1-based index
                    "x": rect.x0,
                    "y": rect.y0,
                    "width": rect.x1 - rect.x0,
                    "height": rect.y1 - rect.y0,
                    "color": "#FFFF00",  # bright yellow
                    "opacity": 0.4,  # 40% opacity (highlighter effect)
                }
                for rect in page.search_for(search_query)
            ]

        if annotations:
            print(f" โœ… {len(annotations)}๊ฐœ ํ•˜์ด๋ผ์ดํŠธ ์ƒ์„ฑ (ํŽ˜์ด์ง€ {page_num})")
        else:
            print(f" โš ๏ธ ํ…์ŠคํŠธ '{search_query[:30]}...' ์ฐพ์ง€ ๋ชปํ•จ")
        return annotations
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back no highlights.
        print(f"โŒ ํ•˜์ด๋ผ์ดํŠธ ์ƒ์„ฑ ์˜ค๋ฅ˜: {e}")
        return []
if __name__ == "__main__":
    # Manual smoke run: search page 1 of a local "test.pdf" and report how
    # many highlight annotations were produced.
    highlights = get_text_coordinates(
        pdf_path="test.pdf",
        page_num=1,
        search_text="sample text",
    )
    highlight_count = len(highlights)
    print(f"ํ•˜์ด๋ผ์ดํŠธ ๊ฐœ์ˆ˜: {highlight_count}")