neerajkalyank committed on
Commit
c1f68a4
·
verified ·
1 Parent(s): b0f9365

Create ingestion.py

Browse files
Files changed (1) hide show
  1. ingestion.py +10 -0
ingestion.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ import docx2txt
3
+
4
def read_file(file):
    """Extract the text content of a file-like object.

    The format is chosen from the extension of ``file.name``: ``.pdf`` is
    parsed with ``PdfReader``, ``.docx`` with ``docx2txt.process``, and
    anything else is read and, if it comes back as bytes, decoded as UTF-8.

    Args:
        file: A file-like object exposing a ``name`` attribute and ``read()``.

    Returns:
        The extracted text as a single string. PDF pages are joined with a
        single space.

    Raises:
        UnicodeDecodeError: If a non-PDF/DOCX file yields bytes that are not
            valid UTF-8.
    """
    # Compare case-insensitively so "REPORT.PDF" is not misread as plain text.
    name = file.name.lower()
    if name.endswith(".pdf"):
        reader = PdfReader(file)
        # Treat a page that yields no text as empty so join only sees strings.
        return " ".join(page.extract_text() or "" for page in reader.pages)
    if name.endswith(".docx"):
        return docx2txt.process(file)
    data = file.read()
    # Text-mode handles already return str; only bytes need decoding.
    return data if isinstance(data, str) else data.decode("utf-8")