Spaces:
Sleeping
Sleeping
Auto commit at 25-2025-08 19:10:52
Browse files
lily_llm_core/document_processor.py
CHANGED
|
@@ -40,6 +40,23 @@ except ImportError:
|
|
| 40 |
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Data classes for production-grade PDF structure analysis
|
| 44 |
@dataclass
|
| 45 |
class BoundingBox:
|
|
|
|
| 40 |
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
+
# Force the NLTK data path to point at a safe (writable) cache directory
|
| 44 |
+
try:
|
| 45 |
+
NLTK_DATA = os.getenv('NLTK_DATA') or '/app/cache/nltk_data'
|
| 46 |
+
os.makedirs(NLTK_DATA, exist_ok=True)
|
| 47 |
+
os.environ['NLTK_DATA'] = NLTK_DATA
|
| 48 |
+
# Optional: pre-download the required resources (e.g. when stopwords are used)
|
| 49 |
+
try:
|
| 50 |
+
import nltk # type: ignore
|
| 51 |
+
# e.g.: nltk.data.find('tokenizers/punkt')
|
| 52 |
+
# Kept commented out so resources are downloaded only when they are actually accessed
|
| 53 |
+
pass
|
| 54 |
+
except Exception:
|
| 55 |
+
pass
|
| 56 |
+
logger.info(f"π§ NLTK_DATA μ€μ : {NLTK_DATA}")
|
| 57 |
+
except Exception as _nltk_e:
|
| 58 |
+
logger.warning(f"β οΈ NLTK_DATA μ€μ μ€ν¨: {_nltk_e}")
|
| 59 |
+
|
| 60 |
# Data classes for production-grade PDF structure analysis
|
| 61 |
@dataclass
|
| 62 |
class BoundingBox:
|