vkumartr committed on
Commit
827e9a8
·
verified ·
1 Parent(s): 48bd53f

Changes updated

Browse files
Files changed (1) hide show
  1. app.py +9 -15
app.py CHANGED
@@ -30,18 +30,17 @@ logger = logging.getLogger(__name__)
30
  MONGODB_URI = os.getenv("MONGODB_URI")
31
  DATABASE_NAME = os.getenv("DATABASE_NAME")
32
  COLLECTION_NAME = os.getenv("COLLECTION_NAME")
33
-
34
- # use_gpu = False
35
- # output_dir = 'output'
36
 
37
  # Check if environment variables are set
38
  if not MONGODB_URI:
39
- raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
40
 
41
  # Initialize MongoDB Connection
42
  db_client = MongoClient(MONGODB_URI)
43
  db = db_client[DATABASE_NAME]
44
  invoice_collection = db[COLLECTION_NAME]
 
45
 
46
  app = FastAPI(docs_url='/')
47
  use_gpu = False
@@ -82,7 +81,7 @@ def fetch_file_from_s3(file_key):
82
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
83
 
84
  # Function to summarize text using OpenAI GPT
85
- def extract_invoice_data(file_data, content_type):
86
  system_prompt = "You are an expert in document data extraction."
87
 
88
  # Convert file to Base64
@@ -123,7 +122,7 @@ def extract_invoice_data(file_data, content_type):
123
 
124
  # Clean and parse JSON output
125
  content = response.choices[0].message.content.strip()
126
- #cleaned_content = content.strip().strip('```json').strip('```')
127
 
128
  try:
129
  parsed_content = json.loads(cleaned_content)
@@ -136,7 +135,7 @@ def extract_invoice_data(file_data, content_type):
136
  logger.error(f"Error in data extraction: {e}")
137
  return {"error": str(e)}
138
 
139
- #def get_content_type_from_s3(file_key):
140
  """Fetch the content type (MIME type) of a file stored in S3."""
141
  try:
142
  response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
@@ -144,7 +143,6 @@ def extract_invoice_data(file_data, content_type):
144
  except Exception as e:
145
  raise Exception(f"Failed to get content type from S3: {str(e)}")
146
 
147
-
148
  # Dependency to check API Key
149
  def verify_api_key(api_key: str = Header(...)):
150
  if api_key != API_KEY:
@@ -164,7 +162,6 @@ def extract_text_from_file(
164
  """Extract text from a PDF or Image stored in S3 and process it based on document size."""
165
  try:
166
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
167
-
168
  if existing_document:
169
  existing_document["_id"] = str(existing_document["_id"])
170
  return {
@@ -172,7 +169,7 @@ def extract_text_from_file(
172
  "document": existing_document
173
  }
174
  # Fetch dynamic schema based on document type
175
- schema_doc = invoice_collection.find_one({"document_type": document_type})
176
  if not schema_doc:
177
  raise ValueError("No schema found for the given document type")
178
 
@@ -182,12 +179,10 @@ def extract_text_from_file(
182
 
183
  # Retrieve file from S3 and determine content type
184
  content_type = get_content_type_from_s3(file_key)
185
-
186
- # Extract and parse invoice data
187
  file_data, _ = fetch_file_from_s3(file_key)
188
  extracted_data = extract_invoice_data(file_data, content_type, json_schema)
189
 
190
- # Store extracted data in MongoDB
191
  document = {
192
  "file_key": file_key,
193
  "file_type": content_type,
@@ -196,7 +191,6 @@ def extract_text_from_file(
196
  "extracted_data": extracted_data
197
  }
198
 
199
- # Insert document into MongoDB
200
  try:
201
  inserted_doc = invoice_collection.insert_one(document)
202
  document_id = str(inserted_doc.inserted_id)
@@ -208,7 +202,7 @@ def extract_text_from_file(
208
  return {
209
  "message": "Document successfully stored in MongoDB",
210
  "document_id": document_id,
211
- "entityrefkey":entity_ref_key,
212
  "extracted_data": extracted_data
213
  }
214
 
 
30
  MONGODB_URI = os.getenv("MONGODB_URI")
31
  DATABASE_NAME = os.getenv("DATABASE_NAME")
32
  COLLECTION_NAME = os.getenv("COLLECTION_NAME")
33
+ SCHEMA = os.getenv("SCHEMA")
 
 
34
 
35
  # Check if environment variables are set
36
  if not MONGODB_URI:
37
+ raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
38
 
39
  # Initialize MongoDB Connection
40
  db_client = MongoClient(MONGODB_URI)
41
  db = db_client[DATABASE_NAME]
42
  invoice_collection = db[COLLECTION_NAME]
43
+ schema_collection = db[SCHEMA]
44
 
45
  app = FastAPI(docs_url='/')
46
  use_gpu = False
 
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
  # Function to summarize text using OpenAI GPT
84
+ def extract_invoice_data(file_data, content_type, json_schema):
85
  system_prompt = "You are an expert in document data extraction."
86
 
87
  # Convert file to Base64
 
122
 
123
  # Clean and parse JSON output
124
  content = response.choices[0].message.content.strip()
125
+ cleaned_content = content.strip().strip('```json').strip('```')
126
 
127
  try:
128
  parsed_content = json.loads(cleaned_content)
 
135
  logger.error(f"Error in data extraction: {e}")
136
  return {"error": str(e)}
137
 
138
+ def get_content_type_from_s3(file_key):
139
  """Fetch the content type (MIME type) of a file stored in S3."""
140
  try:
141
  response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
 
143
  except Exception as e:
144
  raise Exception(f"Failed to get content type from S3: {str(e)}")
145
 
 
146
  # Dependency to check API Key
147
  def verify_api_key(api_key: str = Header(...)):
148
  if api_key != API_KEY:
 
162
  """Extract text from a PDF or Image stored in S3 and process it based on document size."""
163
  try:
164
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
 
165
  if existing_document:
166
  existing_document["_id"] = str(existing_document["_id"])
167
  return {
 
169
  "document": existing_document
170
  }
171
  # Fetch dynamic schema based on document type
172
+ schema_doc = schema_collection.find_one({"document_type": document_type})
173
  if not schema_doc:
174
  raise ValueError("No schema found for the given document type")
175
 
 
179
 
180
  # Retrieve file from S3 and determine content type
181
  content_type = get_content_type_from_s3(file_key)
 
 
182
  file_data, _ = fetch_file_from_s3(file_key)
183
  extracted_data = extract_invoice_data(file_data, content_type, json_schema)
184
 
185
+ # Build document for insertion
186
  document = {
187
  "file_key": file_key,
188
  "file_type": content_type,
 
191
  "extracted_data": extracted_data
192
  }
193
 
 
194
  try:
195
  inserted_doc = invoice_collection.insert_one(document)
196
  document_id = str(inserted_doc.inserted_id)
 
202
  return {
203
  "message": "Document successfully stored in MongoDB",
204
  "document_id": document_id,
205
+ "entityrefkey": entity_ref_key,
206
  "extracted_data": extracted_data
207
  }
208