Spaces:
Runtime error
Runtime error
Create extract_text.py
Browse files- extract_text.py +22 -0
extract_text.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import PyPDF2
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
# Location of the medical book PDF to read (relative to the working directory).
pdf_path = "gale_of_medicin.pdf"
|
| 6 |
+
|
| 7 |
+
def extract_text(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        A single string holding each page's extracted text in page order,
        with a newline appended after every page.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with one join instead of growing a string page by
        # page; the pages are read inside the `with` so the file is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)
|
| 14 |
+
|
| 15 |
+
# Extract the book text and save it as a single-record JSON dataset.
text_data = extract_text(pdf_path)

# One prompt/response record whose response is the full extracted text.
dataset = [{"prompt": "Medical Query", "response": text_data}]
with open("medical_dataset.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters as-is rather than as
    # \uXXXX escapes; the file is opened as UTF-8 to match.
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print("✅ Extracted text saved as `medical_dataset.json`")
|