anuradhakoppala committed on
Commit
facf78f
·
verified ·
1 Parent(s): ddbc9ba

Update summarizer_service.py

Browse files
Files changed (1) hide show
  1. summarizer_service.py +18 -3
summarizer_service.py CHANGED
@@ -1,10 +1,12 @@
1
  from transformers import T5ForConditionalGeneration, T5Tokenizer
2
  from flask import Flask, request, jsonify
 
3
  import pytesseract
4
  from pdf2image import convert_from_path
5
  import docx
 
 
6
  import os
7
- from dotenv import load_dotenv
8
 
9
  load_dotenv()
10
  app = Flask(__name__)
@@ -14,14 +16,27 @@ model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)
14
  tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
15
 
16
  def extract_text(file_path, file_type):
 
 
17
  if file_type == 'pdf':
18
  images = convert_from_path(file_path)
19
- text = ''.join(pytesseract.image_to_string(image) for image in images)
 
 
 
 
 
20
  elif file_type == 'docx':
21
  doc = docx.Document(file_path)
22
- text = '\n'.join(p.text for p in doc.paragraphs)
 
 
 
 
 
23
  else:
24
  raise ValueError('Unsupported file type')
 
25
  return text
26
 
27
  def summarize_contract(text, aspect=None):
 
1
  from transformers import T5ForConditionalGeneration, T5Tokenizer
2
  from flask import Flask, request, jsonify
3
+ from dotenv import load_dotenv
4
  import pytesseract
5
  from pdf2image import convert_from_path
6
  import docx
7
+ import pandas as pd
8
+ from PIL import Image
9
  import os
 
10
 
11
  load_dotenv()
12
  app = Flask(__name__)
 
16
  tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
17
 
18
  def extract_text(file_path, file_type):
19
+ file_type = file_type.lower()
20
+
21
  if file_type == 'pdf':
22
  images = convert_from_path(file_path)
23
+ text = ''.join(pytesseract.image_to_string(img) for img in images)
24
+
25
+ elif file_type in ['jpg', 'jpeg', 'png']:
26
+ image = Image.open(file_path)
27
+ text = pytesseract.image_to_string(image)
28
+
29
  elif file_type == 'docx':
30
  doc = docx.Document(file_path)
31
+ text = '\n'.join([p.text for p in doc.paragraphs])
32
+
33
+ elif file_type in ['csv', 'xls', 'xlsx']:
34
+ df = pd.read_excel(file_path) if file_type in ['xls', 'xlsx'] else pd.read_csv(file_path)
35
+ text = df.to_string(index=False)
36
+
37
  else:
38
  raise ValueError('Unsupported file type')
39
+
40
  return text
41
 
42
  def summarize_contract(text, aspect=None):