Your Name committed on
Commit
dbaf019
·
1 Parent(s): 3055700

Update app.py and requirements.txt with PDF extraction support

Browse files
Files changed (1) hide show
  1. app.py +4 -5
app.py CHANGED
@@ -29,15 +29,14 @@ dataset = load_dataset("dgmos/ericsson-manuals", split="train")
29
  # 4. PDF 텍스트 추출
30
  docs = []
31
  for item in dataset:
32
- # 실제 필드명 확인 (예: "file", "path")
33
- pdf_path = item["file"] # ← 여기서 실제 필드명을 확인하세요!
34
-
35
- # PDF 열기 및 텍스트 추출
36
  try:
37
  with pdfplumber.open(pdf_path) as pdf:
38
  text = "
39
  ".join([page.extract_text() or "" for page in pdf.pages])
40
- docs.append(text)
41
  except Exception as e:
42
  print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
43
  continue
 
29
  # 4. PDF 텍스트 추출
30
  docs = []
31
  for item in dataset:
32
+ # 실제 필드명 확인 필요 (예: "file", "path")
33
+ pdf_path = item["file"]
34
+
 
35
  try:
36
  with pdfplumber.open(pdf_path) as pdf:
37
  text = "
38
  ".join([page.extract_text() or "" for page in pdf.pages])
39
+ docs.append({"page_content": text})
40
  except Exception as e:
41
  print(f"PDF 처리 중 오류: {pdf_path} - {str(e)}")
42
  continue