Shami96 committed on
Commit
f345bcf
·
verified ·
1 Parent(s): ce482c1

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +4 -3
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import pdfplumber
2
  from docx import Document
3
- import os
 
4
 
5
  def extract_text(file):
6
  if file.name.endswith(".pdf"):
@@ -12,12 +13,12 @@ def extract_text(file):
12
  elif file.name.endswith(".txt"):
13
  return file.read().decode("utf-8")
14
  elif file.name.endswith(".html"):
15
- return file.read().decode("utf-8")
 
16
  else:
17
  return file.read().decode("utf-8")
18
 
19
  def render_template(template_path, context):
20
- from jinja2 import Environment, FileSystemLoader
21
  env = Environment(loader=FileSystemLoader("templates"))
22
  template = env.get_template(template_path)
23
  return template.render(context)
 
1
  import pdfplumber
2
  from docx import Document
3
+ from bs4 import BeautifulSoup
4
+ from jinja2 import Environment, FileSystemLoader
5
 
6
  def extract_text(file):
7
  if file.name.endswith(".pdf"):
 
13
  elif file.name.endswith(".txt"):
14
  return file.read().decode("utf-8")
15
  elif file.name.endswith(".html"):
16
+ soup = BeautifulSoup(file.read(), "html.parser")
17
+ return soup.get_text()
18
  else:
19
  return file.read().decode("utf-8")
20
 
21
  def render_template(template_path, context):
 
22
  env = Environment(loader=FileSystemLoader("templates"))
23
  template = env.get_template(template_path)
24
  return template.render(context)