lokesh341 committed on
Commit
04666f5
·
verified ·
1 Parent(s): f858ef3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -77
app.py CHANGED
@@ -1,12 +1,12 @@
1
- # app.py
2
  import re
3
  import logging
 
4
  from io import BytesIO
5
  from datetime import datetime
6
- from typing import List, Dict
7
  from flask import Flask, request, jsonify
8
  from PyPDF2 import PdfReader
9
- from sentence_transformers import SentenceTransformer, util
10
  from simple_salesforce import Salesforce
11
  import torch
12
 
@@ -16,8 +16,11 @@ logging.basicConfig(level=logging.INFO)
16
 
17
  class DocumentProcessor:
18
  def __init__(self):
 
 
 
19
  # Load lightweight sentence transformer model
20
- self.model = SentenceTransformer('all-MiniLM-L6-v2')
21
 
22
  # Define compliance criteria (customize these)
23
  self.compliance_requirements = {
@@ -38,106 +41,165 @@ class DocumentProcessor:
38
  ]
39
  }
40
 
41
- # Pre-compute requirement embeddings
42
- self.requirement_embeddings = {
43
- category: self.model.encode(requirements)
44
- for category, requirements in self.compliance_requirements.items()
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  def extract_text(self, pdf_bytes: bytes) -> str:
48
  """Extract text from PDF document"""
49
- with BytesIO(pdf_bytes) as pdf_file:
50
- reader = PdfReader(pdf_file)
51
- return " ".join(page.extract_text() for page in reader.pages)
 
 
 
 
 
52
 
53
  def score_document(self, text: str) -> Dict:
54
  """Score document against compliance requirements"""
55
- # Split document into sentences
56
- sentences = [s.strip() for s in re.split(r'[.!?]', text) if s.strip()]
57
- sentence_embeddings = self.model.encode(sentences)
58
 
59
- results = {'categories': {}, 'score': 0}
60
- total_matches = 0
61
-
62
- for category, req_embeddings in self.requirement_embeddings.items():
63
- # Find matches between document and requirements
64
- matches = util.semantic_search(
65
- req_embeddings,
66
- sentence_embeddings,
67
- top_k=3,
68
- score_threshold=0.5
69
- )
70
 
71
- # Count valid matches
72
- valid_matches = sum(1 for m in matches if m[0]['score'] > 0.6)
73
- coverage = valid_matches / len(req_embeddings)
74
 
75
- results['categories'][category] = {
76
- 'coverage': coverage,
77
- 'matched_requirements': [
78
- self.compliance_requirements[category][i]
79
- for i, m in enumerate(matches)
80
- if m[0]['score'] > 0.6
81
- ],
82
- 'missing_requirements': [
83
- self.compliance_requirements[category][i]
84
- for i, m in enumerate(matches)
85
- if m[0]['score'] <= 0.6
86
- ]
87
- }
88
- total_matches += valid_matches
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # Calculate overall score (0-5 scale)
91
- total_requirements = sum(len(r) for r in self.compliance_requirements.values())
92
- results['score'] = min(5, round(5 * total_matches / total_requirements, 1))
 
 
 
 
93
 
94
- return results
 
 
 
95
 
96
  class SalesforceHandler:
97
  def __init__(self):
98
- self.sf = Salesforce(
99
- username='your_username',
100
- password='your_password',
101
- security_token='your_token',
102
- domain='login' # or 'test' for sandbox
103
- )
104
-
 
 
 
 
105
  def create_scorecard(self, vendor_id: str, results: Dict) -> Dict:
106
  """Create vendor scorecard in Salesforce"""
107
- record = {
108
- 'Vendor_Name__c': vendor_id,
109
- 'Score__c': results['score'],
110
- 'Evaluation_Date__c': datetime.now().isoformat(),
111
- 'Status__c': 'Evaluated',
112
- 'Details__c': self._format_details(results)
113
- }
114
-
115
  try:
 
 
 
 
 
 
 
 
 
116
  response = self.sf.Vendor_Scorecard__c.create(record)
117
  return {'success': True, 'id': response['id']}
118
  except Exception as e:
119
- logging.error(f"Salesforce error: {str(e)}")
120
  return {'success': False, 'error': str(e)}
121
-
122
  def _format_details(self, results: Dict) -> str:
123
  """Format evaluation details for Salesforce"""
 
 
 
124
  details = []
125
- for category, data in results['categories'].items():
126
  details.append(
127
  f"{category.upper()}:\n"
128
- f"Coverage: {data['coverage']:.0%}\n"
129
- f"Matched: {', '.join(data['matched_requirements'] or 'None')}\n"
130
- f"Missing: {', '.join(data['missing_requirements'] or 'None')}\n"
131
  )
132
- return "\n".join(details)
133
 
134
- # Initialize components
135
- processor = DocumentProcessor()
136
- sf_handler = SalesforceHandler()
 
 
 
 
 
137
 
138
  @app.route('/api/evaluate', methods=['POST'])
139
  def evaluate_document():
140
  """API endpoint for document evaluation"""
 
 
 
141
  if 'file' not in request.files:
142
  return jsonify({'error': 'No file provided'}), 400
143
 
@@ -154,15 +216,16 @@ def evaluate_document():
154
 
155
  if not sf_result['success']:
156
  return jsonify({
157
- 'error': f"Salesforce error: {sf_result['error']}",
158
  'results': results
159
  }), 500
160
 
161
  return jsonify({
162
  'success': True,
163
- 'score': results['score'],
164
- 'salesforce_id': sf_result['id'],
165
- 'evaluation': results['categories']
 
166
  })
167
 
168
  except Exception as e:
@@ -172,7 +235,20 @@ def evaluate_document():
172
  @app.route('/health', methods=['GET'])
173
  def health_check():
174
  """Health check endpoint"""
175
- return jsonify({'status': 'healthy', 'torch_available': torch.cuda.is_available()})
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  if __name__ == '__main__':
178
  app.run(host='0.0.0.0', port=5000)
 
 
1
  import re
2
  import logging
3
+ import numpy as np
4
  from io import BytesIO
5
  from datetime import datetime
6
+ from typing import List, Dict, Optional
7
  from flask import Flask, request, jsonify
8
  from PyPDF2 import PdfReader
9
+ from sentence_transformers import SentenceTransformer
10
  from simple_salesforce import Salesforce
11
  import torch
12
 
 
16
 
17
  class DocumentProcessor:
18
  def __init__(self):
19
+ # Verify numpy is properly installed
20
+ self._verify_numpy()
21
+
22
  # Load lightweight sentence transformer model
23
+ self.model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
24
 
25
  # Define compliance criteria (customize these)
26
  self.compliance_requirements = {
 
41
  ]
42
  }
43
 
44
+ # Pre-compute requirement embeddings with error handling
45
+ self.requirement_embeddings = {}
46
+ for category, requirements in self.compliance_requirements.items():
47
+ try:
48
+ embeddings = self.model.encode(requirements, convert_to_numpy=True)
49
+ self.requirement_embeddings[category] = embeddings
50
+ except Exception as e:
51
+ logging.error(f"Error encoding requirements for {category}: {str(e)}")
52
+ raise
53
+
54
+ def _verify_numpy(self):
55
+ """Verify numpy is working properly"""
56
+ try:
57
+ test_array = np.array([1, 2, 3])
58
+ assert test_array.sum() == 6
59
+ except Exception as e:
60
+ logging.error(f"NumPy verification failed: {str(e)}")
61
+ raise RuntimeError("NumPy is not functioning properly") from e
62
 
63
  def extract_text(self, pdf_bytes: bytes) -> str:
64
  """Extract text from PDF document"""
65
+ try:
66
+ with BytesIO(pdf_bytes) as pdf_file:
67
+ reader = PdfReader(pdf_file)
68
+ text = " ".join(page.extract_text() or "" for page in reader.pages)
69
+ return text.strip()
70
+ except Exception as e:
71
+ logging.error(f"PDF extraction error: {str(e)}")
72
+ raise RuntimeError("Failed to extract text from PDF") from e
73
 
74
  def score_document(self, text: str) -> Dict:
75
  """Score document against compliance requirements"""
76
+ if not text:
77
+ return {'error': 'Empty document text', 'score': 0, 'categories': {}}
 
78
 
79
+ try:
80
+ # Split document into meaningful chunks (not just sentences)
81
+ chunks = self._split_into_chunks(text)
82
+ chunk_embeddings = self.model.encode(chunks, convert_to_numpy=True)
 
 
 
 
 
 
 
83
 
84
+ results = {'categories': {}, 'score': 0}
85
+ total_matches = 0
86
+ total_possible = 0
87
 
88
+ for category, req_embeddings in self.requirement_embeddings.items():
89
+ # Calculate similarity between document chunks and requirements
90
+ similarity_matrix = np.inner(chunk_embeddings, req_embeddings)
91
+ max_similarities = np.max(similarity_matrix, axis=0)
92
+
93
+ # Count matches above threshold
94
+ matches = (max_similarities > 0.65).sum()
95
+ coverage = matches / len(req_embeddings)
96
+
97
+ results['categories'][category] = {
98
+ 'coverage': float(coverage), # Convert numpy float to Python float
99
+ 'matched_requirements': [
100
+ self.compliance_requirements[category][i]
101
+ for i, score in enumerate(max_similarities)
102
+ if score > 0.65
103
+ ],
104
+ 'missing_requirements': [
105
+ self.compliance_requirements[category][i]
106
+ for i, score in enumerate(max_similarities)
107
+ if score <= 0.65
108
+ ]
109
+ }
110
+ total_matches += matches
111
+ total_possible += len(req_embeddings)
112
+
113
+ # Calculate overall score (0-5 scale)
114
+ if total_possible > 0:
115
+ results['score'] = min(5.0, round(5 * total_matches / total_possible, 1))
116
+ return results
117
+
118
+ except Exception as e:
119
+ logging.error(f"Scoring error: {str(e)}")
120
+ return {'error': str(e), 'score': 0, 'categories': {}}
121
+
122
+ def _split_into_chunks(self, text: str, chunk_size: int = 500) -> List[str]:
123
+ """Split text into meaningful chunks of approximately chunk_size characters"""
124
+ words = text.split()
125
+ chunks = []
126
+ current_chunk = []
127
+ current_length = 0
128
 
129
+ for word in words:
130
+ if current_length + len(word) + 1 > chunk_size and current_chunk:
131
+ chunks.append(" ".join(current_chunk))
132
+ current_chunk = []
133
+ current_length = 0
134
+ current_chunk.append(word)
135
+ current_length += len(word) + 1
136
 
137
+ if current_chunk:
138
+ chunks.append(" ".join(current_chunk))
139
+
140
+ return chunks
141
 
142
  class SalesforceHandler:
143
  def __init__(self):
144
+ try:
145
+ self.sf = Salesforce(
146
+ username='your_username',
147
+ password='your_password',
148
+ security_token='your_token',
149
+ domain='login' # or 'test' for sandbox
150
+ )
151
+ except Exception as e:
152
+ logging.error(f"Salesforce connection error: {str(e)}")
153
+ raise
154
+
155
  def create_scorecard(self, vendor_id: str, results: Dict) -> Dict:
156
  """Create vendor scorecard in Salesforce"""
 
 
 
 
 
 
 
 
157
  try:
158
+ record = {
159
+ 'Vendor_Name__c': vendor_id,
160
+ 'Score__c': results.get('score', 0),
161
+ 'Evaluation_Date__c': datetime.now().isoformat(),
162
+ 'Status__c': 'Evaluated',
163
+ 'Details__c': self._format_details(results),
164
+ 'Error__c': results.get('error', '')
165
+ }
166
+
167
  response = self.sf.Vendor_Scorecard__c.create(record)
168
  return {'success': True, 'id': response['id']}
169
  except Exception as e:
170
+ logging.error(f"Salesforce create error: {str(e)}")
171
  return {'success': False, 'error': str(e)}
172
+
173
  def _format_details(self, results: Dict) -> str:
174
  """Format evaluation details for Salesforce"""
175
+ if 'error' in results:
176
+ return f"Error: {results['error']}"
177
+
178
  details = []
179
+ for category, data in results.get('categories', {}).items():
180
  details.append(
181
  f"{category.upper()}:\n"
182
+ f"Coverage: {data.get('coverage', 0):.0%}\n"
183
+ f"Matched: {', '.join(data.get('matched_requirements', ['None']))}\n"
184
+ f"Missing: {', '.join(data.get('missing_requirements', ['None']))}\n"
185
  )
186
+ return "\n".join(details) if details else "No evaluation details available"
187
 
188
# Build the shared service objects once at import time. Both names stay None
# if either constructor fails, so the request handlers can detect a failed
# start-up instead of the import itself raising.
processor = None
sf_handler = None
try:
    processor = DocumentProcessor()
    sf_handler = SalesforceHandler()
except Exception as e:
    logging.error(f"Initialization failed: {str(e)}")
    # Drop a processor that was built before the Salesforce handler failed.
    processor = None
196
 
197
  @app.route('/api/evaluate', methods=['POST'])
198
  def evaluate_document():
199
  """API endpoint for document evaluation"""
200
+ if not processor or not sf_handler:
201
+ return jsonify({'error': 'Service initialization failed'}), 500
202
+
203
  if 'file' not in request.files:
204
  return jsonify({'error': 'No file provided'}), 400
205
 
 
216
 
217
  if not sf_result['success']:
218
  return jsonify({
219
+ 'error': f"Salesforce error: {sf_result.get('error', 'Unknown error')}",
220
  'results': results
221
  }), 500
222
 
223
  return jsonify({
224
  'success': True,
225
+ 'score': results.get('score', 0),
226
+ 'salesforce_id': sf_result.get('id'),
227
+ 'evaluation': results.get('categories', {}),
228
+ 'error': results.get('error', '')
229
  })
230
 
231
  except Exception as e:
 
235
  @app.route('/health', methods=['GET'])
236
  def health_check():
237
  """Health check endpoint"""
238
+ status = {
239
+ 'status': 'healthy' if processor and sf_handler else 'unhealthy',
240
+ 'torch_available': torch.cuda.is_available() if torch else False,
241
+ 'numpy_version': np.__version__,
242
+ 'numpy_working': False
243
+ }
244
+
245
+ try:
246
+ test_array = np.array([1, 2, 3])
247
+ status['numpy_working'] = test_array.sum() == 6
248
+ except Exception as e:
249
+ logging.error(f"Health check numpy test failed: {str(e)}")
250
+
251
+ return jsonify(status)
252
 
253
  if __name__ == '__main__':
254
  app.run(host='0.0.0.0', port=5000)