Adi016 committed on
Commit
ffbf952
·
verified ·
1 Parent(s): 900227f

Upload folder using huggingface_hub

Browse files
__pycache__/classification.cpython-38.pyc ADDED
Binary file (5.49 kB). View file
 
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from flask_cors import CORS
3
+ import classification
4
+
5
# Flask application object; the route below is registered on it.
app = Flask(__name__)
# Wrap the app with flask_cors so cross-origin requests are accepted.
CORS(app)
7
+
8
@app.route('/api/classify', methods=['POST'])
def classify_email():
    """Classify the intent of an email posted as JSON.

    The request body must be a JSON object carrying at least one of the keys
    ``subject`` and ``body``. Both fields are merged into a single text that
    is handed to ``classification.classify``.

    Returns:
        Whatever ``classification.classify`` returns on success, or a JSON
        ``{'error': ...}`` payload with HTTP status 400 when the body is not
        a JSON object or contains neither field.
    """
    # silent=True yields None for a missing, malformed or non-JSON body
    # instead of aborting, so every bad request takes the 400 path below.
    data = request.get_json(silent=True)

    # Only a JSON object can carry the fields; a JSON list or string would
    # pass a bare membership test and then fail on ``.get`` with a 500.
    if not isinstance(data, dict) or ('subject' not in data and 'body' not in data):
        return jsonify({'error': 'Missing email content'}), 400

    subject = data.get('subject')
    body = data.get('body')
    # An explicit JSON null must not reach the model as the text "None".
    subject = '' if subject is None else subject
    body = '' if body is None else body

    # NOTE(review): the whitespace inside this literal is part of the model
    # input; the indentation shown here is assumed -- confirm against the
    # layout used when the checkpoint was trained.
    email = f"""
    Subject: {subject}
    Body: {body}
    """

    result = classification.classify(email)

    return result
26
+
27
if __name__ == '__main__':
    import os

    # Debug mode turns on the Werkzeug interactive debugger, which allows
    # code execution from the browser. Keep it opt-in (FLASK_DEBUG=1) so a
    # deployed copy of this script does not expose it by default.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, port=5007)
checkpoint-770.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:440847c43b6723ad6dec74aa701f01bdd3783d440233b69f99b10aaf1565b05d
3
+ size 617581195
checkpoint-770/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "Intent_Amendment_Abstraction",
15
+ "1": "Intent_Clause_Protect",
16
+ "2": "Intent_Company_research",
17
+ "3": "Intent_Comparison_LOI_Lease",
18
+ "4": "Intent_Lease_Abstraction",
19
+ "5": "Intent_Lease_Listings_Comparison",
20
+ "6": "Intent_Sales_Listings_Comparison",
21
+ "7": "Intent_Transaction_Date_navigator"
22
+ },
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 3072,
25
+ "label2id": {
26
+ "Intent_Amendment_Abstraction": 0,
27
+ "Intent_Clause_Protect": 1,
28
+ "Intent_Company_research": 2,
29
+ "Intent_Comparison_LOI_Lease": 3,
30
+ "Intent_Lease_Abstraction": 4,
31
+ "Intent_Lease_Listings_Comparison": 5,
32
+ "Intent_Sales_Listings_Comparison": 6,
33
+ "Intent_Transaction_Date_navigator": 7
34
+ },
35
+ "layer_norm_eps": 1e-05,
36
+ "max_position_embeddings": 514,
37
+ "model_type": "roberta",
38
+ "num_attention_heads": 12,
39
+ "num_hidden_layers": 12,
40
+ "pad_token_id": 1,
41
+ "position_embedding_type": "absolute",
42
+ "problem_type": "single_label_classification",
43
+ "torch_dtype": "float32",
44
+ "transformers_version": "4.51.3",
45
+ "type_vocab_size": 1,
46
+ "use_cache": true,
47
+ "vocab_size": 50265
48
+ }
checkpoint-770/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-770/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d157cae8ab980d89e5d426c84adeeed7a7c7aa988652ee0db903f0e554d5c96
3
+ size 498631280
checkpoint-770/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c2d013e8a08903506b83e98e7795cebb6ed24e8d8cfa6be6ab33da27d900b37
3
+ size 254292026
checkpoint-770/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5c63f61290f5421d6c6c8d31fafa5315c374c21561b10e72403200f87627ab2
3
+ size 14244
checkpoint-770/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b911e9d190a52c2fdaf1fc288a99472d1155b896916224bae1f6948903948c82
3
+ size 1064
checkpoint-770/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
checkpoint-770/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-770/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": false,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "extra_special_tokens": {},
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 1000000000000000019884624838656,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
checkpoint-770/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-770/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7a683e8060061a252378604e6aed2dc6535a81e8f3f319e777d656331eab38
3
+ size 5240
checkpoint-770/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
classification.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from scipy.special import softmax
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+
5
import os

# Directory holding the fine-tuned checkpoint (config, tokenizer files and
# weights). By default it is the "checkpoint-770" folder located next to this
# module, so the code works wherever the repository is checked out; set
# EMAIL_CLASSIFIER_CHECKPOINT to load a checkpoint from another location.
_CHECKPOINT_DIR = os.environ.get(
    "EMAIL_CLASSIFIER_CHECKPOINT",
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "checkpoint-770"),
)

# Loaded once at import time and shared by every classify() call.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT_DIR)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT_DIR)
7
+
8
def classify(email, *, min_confidence=0.0):
    """Predict the business intent of an email with the fine-tuned model.

    The text is tokenized (truncated to 256 tokens), run through the
    module-level ``model`` without gradient tracking, and the logits are
    turned into probabilities with a softmax. The most probable class id is
    mapped to its intent label.

    Args:
        email: The email text to classify.
        min_confidence: Optional probability threshold. When the winning
            class scores below it, the email is reported as
            ``"Intent_Mixed_Other"``. The default of 0.0 never triggers, so
            the most probable of the eight intents is always returned.

    Returns:
        A string of the form ``"\\n🧠 Predicted Category: <label>"``.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Class ids as produced by the classification head.
    id2label = {
        0: "Intent_Amendment_Abstraction",       # changes introduced by a lease amendment
        1: "Intent_Clause_Protect",              # risky / missing / unfavorable lease clauses
        2: "Intent_Company_research",            # background or due diligence on a company
        3: "Intent_Comparison_LOI_Lease",        # LOI vs. final lease discrepancies
        4: "Intent_Lease_Abstraction",           # key lease metadata and clauses
        5: "Intent_Lease_Listings_Comparison",   # compare lease listing summaries
        6: "Intent_Sales_Listings_Comparison",   # compare sales listing summaries
        7: "Intent_Transaction_Date_navigator",  # transaction-related dates
    }
    fallback_label = "Intent_Mixed_Other"

    encoded_input = tokenizer(
        email,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=256,
    )

    with torch.no_grad():
        output = model(**encoded_input)

    # A single text was encoded, so row 0 holds its logits.
    scores = output.logits[0].detach().numpy()
    probs = softmax(scores)
    predicted_class_id = int(probs.argmax())

    # Diagnostics go to the logger instead of stdout on every request.
    logger.debug(
        "logits=%s probs=%s predicted_class_id=%d",
        scores, probs, predicted_class_id,
    )

    if probs[predicted_class_id] < min_confidence:
        predicted_label = fallback_label
    else:
        # An id outside the known range also maps to the fallback label.
        predicted_label = id2label.get(predicted_class_id, fallback_label)

    return f"\n🧠 Predicted Category: {predicted_label}"
prompt.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Few-shot prompt template for classifying an email into one of the eight
# business intents (or "Intent_Mixed_Other"). It contains a single ``{email}``
# placeholder near the end; presumably it is filled in with ``str.format`` or
# an equivalent by the caller -- TODO confirm, no caller is visible here.
# NOTE(review): the intents are numbered 1-8 in this text, in a different
# order from the 0-7 class ids in checkpoint-770/config.json, so the list
# positions must not be read as model class ids.
promptForIntentclassification = """
You are an AI who is an expert email analyzer and classification system specializing in real estate and legal documentation.
Your task is to accurately classify incoming emails into one of 8 predefined business intents. You must also identify emails that
contain mixed intents or fall outside these 8 categories.

**Here are the 8 predefined business intents:**

1. **Intent_Lease_Abstraction**: Emails requesting the extraction of key lease metadata and clauses (e.g., rent, term, landlord, tenant, renewal options, escalation schedules, important dates, responsibilities).
2. **Intent_Comparison_LOI_Lease**: Emails asking to compare a Letter of Intent (LOI) with a final lease agreement to identify discrepancies, changes, or deviations in terms (e.g., TI allowances, common area maintenance, termination clauses).
3. **Intent_Clause_Protect**: Emails requesting a review of lease clauses to detect potentially risky, missing, or unfavorable terms (e.g., subletting rights, break clauses, indemnity, assignment terms, unreasonable liabilities, compliance issues).
4. **Intent_Company_research**: Emails seeking background information or due diligence on a company involved in a transaction (e.g., credibility, litigation history, public disputes, bankruptcies, financial health, track record).
5. **Intent_Transaction_Date_navigator**: Emails focused on extracting, scheduling, or managing transaction-related dates (e.g., escrow, closing, notice periods, possession dates, due diligence deadlines, funding deadlines, inspection dates).
6. **Intent_Amendment_Abstraction**: Emails requesting the extraction of new terms or highlighting changes introduced by a lease amendment compared to the original lease.
7. **Intent_Sales_Listings_Comparison**: Emails asking to compare multiple sales listing summaries for properties, focusing on metrics like pricing, square footage, capitalization rate (cap rate), and average price per square foot (PSF).
8. **Intent_Lease_Listings_Comparison**: Emails asking to compare multiple lease listing summaries for properties, focusing on identifying the best terms, overlaps, gaps, per square foot pricing, and tenant-friendly clauses.

**If an email clearly contains elements of more than one of the above intents, or if its primary intent does not fit any of the 8 categories, classify it as "Intent_Mixed_Other".** This "Intent_Mixed_Other" category is crucial for handling complex or out-of-scope requests.

**Output Format:**

For each email, provide *only* the most appropriate intent label. Do not include any additional text or explanation strictly.


Here are some **Example Emails:**

**Email 1:**
Subject: Lease Summary for 123 Main St
Body: Hi team, please summarize the key terms of the lease for the 123 Main St property. I need to know the base rent, commencement and expiry dates, renewal options, and escalation schedule. Thanks!

**Classification 1:** Intent_Lease_Abstraction

**Email 2:**
Subject: LOI vs. Lease Discrepancies - 789 Oak Ave
Body: Hey, I need help comparing the LOI we submitted for 789 Oak Ave with the final lease. Can you identify any deviations, especially around TI allowances and common area maintenance? Appreciate your help.

**Classification 2:** Intent_Comparison_LOI_Lease

**Email 3:**
Subject: Review for Risky Clauses - New Lease for 456 Elm Rd
Body: Could you please review the new lease for 456 Elm Rd and detect any potentially risky or missing lease clauses, such as those related to subletting rights or indemnity? Best regards.

**Classification 3:** Intent_Clause_Protect

**Email 4:**
Subject: Background Check on Global Holdings Inc.
Body: Urgent: Can you do a background check on Global Holdings Inc. before we proceed? I’m particularly interested in any litigation history or bankruptcies in the past 5 years. Cheers.

**Classification 4:** Intent_Company_research


Here is the email {email}. Now classify this keeping all points in view. Do not hallaucinate.
"""
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ flask>=2.2
2
+ flask-cors
3
+ werkzeug>=2.2
4
+ transformers
5
+ torch
6
+ scipy
7
+ datasets
8
+ scikit-learn
9
+ pandas