Spaces:

Omkar008
/

receipt_radar_test

Sleeping

App Files Files Community

Omkar008 committed on Feb 7, 2024

Commit

0527f8f

verified ·

1 Parent(s): ccf236c

Update get_gmail_data.py

Browse files

Files changed (1) hide show

get_gmail_data.py +25 -5

get_gmail_data.py CHANGED Viewed

@@ -148,11 +148,22 @@ class GmailDataExtractor:
         if 'payload' in message_data and 'parts' in message_data['payload']:
             parts = message_data['payload']['parts']
             for part in parts:
                 if 'mimeType' not in part:
                     continue
                 mime_type = part['mimeType']
                 if mime_type == 'text/plain' or mime_type == 'text/html':
                     body_data = part['body'].get('data', '')
                     body = base64.urlsafe_b64decode(body_data).decode('utf-8')
@@ -166,9 +177,18 @@ class GmailDataExtractor:
                     if data:
                         # Save only the first 10 characters of the attachment data
-                        return subject,body ,{"filename":filename , "data":data}
-        return subject, body,None
     def extract_text_and_links(html_content: str) -> tuple:
         """
@@ -214,14 +234,14 @@ class GmailDataExtractor:
         messages = self.__fetch_messages()
         results = []
         for message in messages:
-            subject, body, attachment_data = self.__process_message(message)
             """ Handling None values """
-            subject = subject if subject is not None else ""
             body = body if body is not None else None
             attachment_data = attachment_data if attachment_data is not None else {}
-            results.append({"body": body, "attachment_data": [attachment_data]})
         return {"results": results}

         if 'payload' in message_data and 'parts' in message_data['payload']:
             parts = message_data['payload']['parts']
+            payload = message_data['payload']
+            brand_from_gmail = ''
+            company_from_gmail = ''
+            if payload['headers']['name'] == 'from':
+                brand_from_gmail = payload['headers']['value']
+                company_from_gmail = extract_domain_from_email(brand_from_gmail)
+            else:
+                company_from_gmail = None
             for part in parts:
                 if 'mimeType' not in part:
                     continue
                 mime_type = part['mimeType']
                 if mime_type == 'text/plain' or mime_type == 'text/html':
                     body_data = part['body'].get('data', '')
                     body = base64.urlsafe_b64decode(body_data).decode('utf-8')
                     if data:
                         # Save only the first 10 characters of the attachment data
+                        return subject,body ,{"filename":filename , "data":data} , company_from_gmail
+        return subject, body,None , company_from_gmail
+    def extract_domain_from_email(email):
+        regex = r"@(.+)$"
+        match = re.search(regex,email)
+        if match :
+            return match.group(1)
+        else:
+            return None
     def extract_text_and_links(html_content: str) -> tuple:
         """
         messages = self.__fetch_messages()
         results = []
         for message in messages:
+            subject, body, attachment_data , company_name = self.__process_message(message)
             """ Handling None values """
             body = body if body is not None else None
             attachment_data = attachment_data if attachment_data is not None else {}
+            company_associated = company_name if company_name is not None else None
+            results.append({"body": body, "attachment_data": [attachment_data] ,'company_associated':company_associated})
         return {"results": results}