Spaces:
No application file
No application file
Completing the scraping script.
Browse files- .gitignore +3 -0
- document_scraping/scrape.py +21 -0
.gitignore
CHANGED
|
@@ -127,3 +127,6 @@ dmypy.json
|
|
| 127 |
|
| 128 |
# Pyre type checker
|
| 129 |
.pyre/
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
# Pyre type checker
|
| 129 |
.pyre/
|
| 130 |
+
|
| 131 |
+
# the assets
|
| 132 |
+
documents/
|
document_scraping/scrape.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Extract text from every PDF in ./documents and concatenate it into result.txt.

Uses pdfplumber for per-page text extraction. Non-PDF files are skipped so
re-running the script does not try to parse its own result.txt output.
"""
import pdfplumber
from pathlib import Path

input_path = Path("./documents")

# Collect page texts in a list and join once at the end — repeated
# `result += text` is quadratic in the total text size.
parts = []

# Only process .pdf files: the original os.listdir() picked up every file,
# including the result.txt written below, which would crash a second run.
# sorted() makes the output order deterministic across filesystems.
for pdf_path in sorted(input_path.glob("*.pdf")):
    # Context manager guarantees the PDF file handle is closed.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # extract_text() returns None for pages without a text layer;
            # the original `result += text` raised TypeError in that case.
            if text:
                parts.append(text)

result = "".join(parts)

# Encoding to ASCII will remove special characters.
result = result.encode(encoding="ASCII", errors="ignore").decode()

# Result is pure ASCII at this point, so utf-8 writes it unchanged.
with open(input_path / "result.txt", "w", encoding="utf-8") as f:
    f.write(result)