gbrabbit committed on
Commit
6d190ed
·
1 Parent(s): 88e4071

Auto commit at 25-2025-08 19:10:52

Browse files
Files changed (1) hide show
  1. lily_llm_core/document_processor.py +17 -0
lily_llm_core/document_processor.py CHANGED
@@ -40,6 +40,23 @@ except ImportError:
40
 
41
  logger = logging.getLogger(__name__)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # 실무 수준 PDF 구조 분석을 위한 데이터 클래스들
44
  @dataclass
45
  class BoundingBox:
 
40
 
41
  logger = logging.getLogger(__name__)
42
 
43
+ # NLTK 데이터 경로를 안전한 캐시 디렉터리로 강제 설정
44
+ try:
45
+ NLTK_DATA = os.getenv('NLTK_DATA') or '/app/cache/nltk_data'
46
+ os.makedirs(NLTK_DATA, exist_ok=True)
47
+ os.environ['NLTK_DATA'] = NLTK_DATA
48
+ # 옵션: 필요한 리소스를 미리 다운로드 (stopwords 등 사용 시)
49
+ try:
50
+ import nltk # type: ignore
51
+ # 예: nltk.data.find('tokenizers/punkt')
52
+ # 필요 자원 접근 시에만 다운로드하도록 주석 처리 유지
53
+ pass
54
+ except Exception:
55
+ pass
56
+ logger.info(f"πŸ”§ NLTK_DATA μ„€μ •: {NLTK_DATA}")
57
+ except Exception as _nltk_e:
58
+ logger.warning(f"⚠️ NLTK_DATA μ„€μ • μ‹€νŒ¨: {_nltk_e}")
59
+
60
  # 실무 수준 PDF 구조 분석을 위한 데이터 클래스들
61
  @dataclass
62
  class BoundingBox: