Your Name committed on
Commit
df786c4
·
1 Parent(s): dbaf019

Update app.py and requirements.txt with PDF extraction support

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -29,14 +29,18 @@ dataset = load_dataset("dgmos/ericsson-manuals", split="train")
29
  # 4. PDF 텍스트 추출
30
  docs = []
31
  for item in dataset:
32
- # 실제 필드명 확인 필요 (예: "file", "path")
33
- pdf_path = item["file"]
 
 
 
34
 
35
  try:
36
  with pdfplumber.open(pdf_path) as pdf:
37
  text = "
38
  ".join([page.extract_text() or "" for page in pdf.pages])
39
- docs.append({"page_content": text})
 
40
  except Exception as e:
41
  print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
42
  continue
 
29
  # 4. PDF 텍스트 추출
30
  docs = []
31
  for item in dataset:
32
+ # 실제 필드명 확인 필요 (예: "file", "path" 등)
33
+ pdf_path = item.get("file") or item.get("path") or None
34
+ if not pdf_path:
35
+ print(f"⚠️ PDF 경로가 없음: {item}")
36
+ continue
37
 
38
  try:
39
  with pdfplumber.open(pdf_path) as pdf:
40
  text = "
41
  ".join([page.extract_text() or "" for page in pdf.pages])
42
+ if text.strip():
43
+ docs.append({"page_content": text})
44
  except Exception as e:
45
  print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
46
  continue