Spaces:
Build error
Build error
Your Name committed on
Commit ·
df786c4
1
Parent(s): dbaf019
Update app.py and requirements.txt with PDF extraction support
Browse files
app.py
CHANGED
|
@@ -29,14 +29,18 @@ dataset = load_dataset("dgmos/ericsson-manuals", split="train")
|
|
| 29 |
# 4. PDF 텍스트 추출
|
| 30 |
docs = []
|
| 31 |
for item in dataset:
|
| 32 |
-
# 실제 필드명 확인 필요 (예: "file", "path")
|
| 33 |
-
pdf_path = item
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
try:
|
| 36 |
with pdfplumber.open(pdf_path) as pdf:
|
| 37 |
text = "
|
| 38 |
".join([page.extract_text() or "" for page in pdf.pages])
|
| 39 |
-
|
|
|
|
| 40 |
except Exception as e:
|
| 41 |
print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
|
| 42 |
continue
|
|
|
|
| 29 |
# 4. PDF 텍스트 추출
|
| 30 |
docs = []
|
| 31 |
for item in dataset:
|
| 32 |
+
# 실제 필드명 확인 필요 (예: "file", "path" 등)
|
| 33 |
+
pdf_path = item.get("file") or item.get("path") or None
|
| 34 |
+
if not pdf_path:
|
| 35 |
+
print(f"⚠️ PDF 경로가 없음: {item}")
|
| 36 |
+
continue
|
| 37 |
|
| 38 |
try:
|
| 39 |
with pdfplumber.open(pdf_path) as pdf:
|
| 40 |
text = "
|
| 41 |
".join([page.extract_text() or "" for page in pdf.pages])
|
| 42 |
+
if text.strip():
|
| 43 |
+
docs.append({"page_content": text})
|
| 44 |
except Exception as e:
|
| 45 |
print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
|
| 46 |
continue
|