ABDALLAH31 committed on
Commit
44a60f7
·
verified ·
1 Parent(s): 9555d24

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +12 -0
utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ import re
3
+
4
def extract_text_from_pdf(file_obj):
    """Return the text of every page of a PDF as a single string.

    Args:
        file_obj: Passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of each page that yielded any, in page order, joined with
        a newline. An empty string when no page yields text.
    """
    with pdfplumber.open(file_obj) as pdf:
        # A page may produce a falsy result (no extractable text);
        # normalize it to "" so it can be filtered out below.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next, which would also hide sentence boundaries from
    # whitespace-based splitting downstream.
    return "\n".join(chunk for chunk in page_texts if chunk)
10
+
11
def simple_clause_split(text):
    """Split *text* into trimmed, non-empty sentence-like pieces.

    A boundary is any run of whitespace immediately preceded by ".", "?"
    or "!"; the punctuation stays attached to the piece it terminates.

    Args:
        text: The string to split.

    Returns:
        A list of stripped pieces in their original order, with empty
        pieces dropped.
    """
    clauses = []
    for fragment in re.split(r'(?<=[.?!])\s+', text):
        trimmed = fragment.strip()
        if trimmed:
            clauses.append(trimmed)
    return clauses