Tanxshh committed on
Commit
72d628f
·
verified ·
1 Parent(s): d07d383

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +100 -96
pipeline.py CHANGED
@@ -1,96 +1,100 @@
1
- import os
2
- import re
3
- import fitz # PyMuPDF
4
- import torch
5
- import pandas as pd
6
- from sentence_transformers import SentenceTransformer, util
7
-
8
- # Load model
9
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
10
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
11
-
12
- # Reference phrases
13
- env_ref = ["environment","climate change","carbon emissions","pollution","waste","green energy",
14
- "renewable resources","sustainability","biodiversity","eco-friendly","net zero",
15
- "solar energy","wind energy","water conservation"]
16
-
17
- esg_ref = ["environment","social responsibility","governance","sustainability","carbon emissions",
18
- "green energy","renewable resources","waste management","climate change","pollution control",
19
- "biodiversity","eco-friendly","net zero","solar energy","wind energy","water conservation",
20
- "community development","employee welfare","diversity","ethics"]
21
-
22
- action_ref = ["implemented","adopted","reduced emissions","recycled","renewable energy",
23
- "sustainability project","steps taken to reduce carbon emissions",
24
- "initiatives to help the environment","measures to prevent greenwashing"]
25
-
26
- claim_ref = ["plans to achieve","committed to","targets","pledges","goal","aims to",
27
- "intent to reduce","objective to be","aims for sustainability","pledged to achieve",
28
- "will reduce carbon","expect to reach net zero","plans to be carbon neutral by",
29
- "commitment to net zero by","goal to be eco friendly by","target year for sustainability",
30
- "striving to be net zero","intends to adopt renewable energy","aiming for eco-friendly operations"]
31
-
32
- # Extract text
33
- def extract_text(pdf_path):
34
- text = ""
35
- with fitz.open(pdf_path) as doc:
36
- for page in doc:
37
- text += page.get_text()
38
- return text
39
-
40
- def split_sentences(text):
41
- return re.split(r'(?<=[.!?])\s+', text)
42
-
43
- def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
44
- ref_emb = model.encode(reference, convert_to_tensor=True)
45
- matches = []
46
- for i in range(0, len(sentences), batch_size):
47
- batch = sentences[i:i+batch_size]
48
- sent_emb = model.encode(batch, convert_to_tensor=True)
49
- sim_matrix = util.cos_sim(sent_emb, ref_emb)
50
- for j, sim_scores in enumerate(sim_matrix):
51
- if sim_scores.max().item() >= threshold:
52
- matches.append(batch[j].strip())
53
- return matches if matches else ["NA"]
54
-
55
- # Pipeline for PDFs
56
- def run_pipeline(pdf_folder):
57
- data = []
58
- pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
59
-
60
- for pdf in pdf_files:
61
- company_name = os.path.splitext(pdf)[0]
62
- pdf_path = os.path.join(pdf_folder, pdf)
63
-
64
- text = extract_text(pdf_path)
65
- sentences = split_sentences(text)
66
- total_sentences = len(sentences) if sentences else 1 # avoid division by zero
67
-
68
- env_sentences = semantic_matches(sentences, env_ref)
69
- esg_sentences = semantic_matches(sentences, esg_ref)
70
- action_sentences = semantic_matches(sentences, action_ref)
71
- claim_sentences = semantic_matches(sentences, claim_ref, threshold=0.54)
72
-
73
- env_count = len([s for s in env_sentences if s != "NA"])
74
- esg_count = len([s for s in esg_sentences if s != "NA"])
75
- action_count = len([s for s in action_sentences if s != "NA"])
76
- claim_count = len([s for s in claim_sentences if s != "NA"])
77
-
78
- env_score = (env_count / total_sentences) * 100
79
- claim_score = (claim_count / total_sentences) * 100
80
- action_score = (action_count / total_sentences) * 100
81
- relative_focus = (esg_count / total_sentences) * 100
82
-
83
- net_action = action_score - claim_score
84
- net_direction = "Positive" if net_action > 0 else "Negative"
85
-
86
- data.append({
87
- "Company": company_name,
88
- "Relative Focus Score": round(relative_focus, 2),
89
- "Environment Score": round(env_score, 2),
90
- "Claims Score": round(claim_score, 2),
91
- "Actions Score": round(action_score, 2),
92
- "Net Action": round(net_action, 2),
93
- "Direction": net_direction
94
- })
95
-
96
- return pd.DataFrame(data)
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import fitz # PyMuPDF
4
+ import torch
5
+ import pandas as pd
6
+ from sentence_transformers import SentenceTransformer, util
7
+
8
# ======= HF Spaces Docker Fix =======
# Point the Hugging Face caches at a directory inside the container.
# NOTE: these variables are assigned after `sentence_transformers` has already
# been imported at the top of the file. The Hugging Face libraries resolve
# their cache locations when they are imported, so the assignments alone may
# come too late to affect this process; the cache directory is therefore also
# handed to the model constructor explicitly below.
_HF_CACHE_DIR = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = _HF_CACHE_DIR
os.environ["HF_HOME"] = _HF_CACHE_DIR

# Load model
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device=device,
    cache_folder=_HF_CACHE_DIR,  # download/load the weights from the same cache dir
)
15
+
16
# Reference phrases
# Each list is embedded and compared against document sentences by
# semantic_matches(); a sentence counts for a category when it is similar
# enough to at least one phrase in that category's list.

# Environmental vocabulary.
env_ref = [
    "environment",
    "climate change",
    "carbon emissions",
    "pollution",
    "waste",
    "green energy",
    "renewable resources",
    "sustainability",
    "biodiversity",
    "eco-friendly",
    "net zero",
    "solar energy",
    "wind energy",
    "water conservation",
]

# Broader ESG vocabulary (environmental, social and governance terms).
esg_ref = [
    "environment",
    "social responsibility",
    "governance",
    "sustainability",
    "carbon emissions",
    "green energy",
    "renewable resources",
    "waste management",
    "climate change",
    "pollution control",
    "biodiversity",
    "eco-friendly",
    "net zero",
    "solar energy",
    "wind energy",
    "water conservation",
    "community development",
    "employee welfare",
    "diversity",
    "ethics",
]

# Phrases describing actions that have been carried out.
action_ref = [
    "implemented",
    "adopted",
    "reduced emissions",
    "recycled",
    "renewable energy",
    "sustainability project",
    "steps taken to reduce carbon emissions",
    "initiatives to help the environment",
    "measures to prevent greenwashing",
]

# Phrases describing forward-looking claims, pledges and targets.
claim_ref = [
    "plans to achieve",
    "committed to",
    "targets",
    "pledges",
    "goal",
    "aims to",
    "intent to reduce",
    "objective to be",
    "aims for sustainability",
    "pledged to achieve",
    "will reduce carbon",
    "expect to reach net zero",
    "plans to be carbon neutral by",
    "commitment to net zero by",
    "goal to be eco friendly by",
    "target year for sustainability",
    "striving to be net zero",
    "intends to adopt renewable energy",
    "aiming for eco-friendly operations",
]
35
+
36
+ # Extract text
37
def extract_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with PyMuPDF (``fitz``) and closed again when the
    ``with`` block exits. Page texts are concatenated in page order with no
    separator inserted between them; a document without pages yields ``""``.
    """
    with fitz.open(pdf_path) as document:
        # The join runs inside the ``with`` block, so all pages are read
        # before the document is closed.
        return "".join(page.get_text() for page in document)
43
+
44
def split_sentences(text):
    """Split ``text`` into sentence-like fragments.

    The text is cut at every run of whitespace that directly follows a
    ``.``, ``!`` or ``?``. Each fragment is stripped of surrounding
    whitespace, and fragments that are empty after stripping are dropped, so
    text ending in whitespace (or containing no text at all) does not
    produce blank "sentences".

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty, stripped fragments in their original order.
        Empty or whitespace-only input returns an empty list.
    """
    fragments = re.split(r'(?<=[.!?])\s+', text)
    stripped = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in stripped if fragment]
46
+
47
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """Return the sentences that are semantically close to a reference phrase.

    The reference phrases are encoded once per call with the module-level
    ``model``; the sentences are encoded in slices of ``batch_size``. For each
    sentence the cosine similarity to every reference phrase is computed with
    ``util.cos_sim`` and the sentence is kept when its highest similarity is
    at least ``threshold``.

    Args:
        sentences: Sequence of sentence strings to test.
        reference: List of reference phrases to compare against.
        threshold: Minimum best-match cosine similarity for a sentence to be kept.
        batch_size: Number of sentences encoded per call to ``model.encode``.

    Returns:
        The matching sentences, stripped of surrounding whitespace, in input
        order. When nothing matches, the placeholder list ``["NA"]`` is
        returned instead of an empty list.
    """
    reference_embeddings = model.encode(reference, convert_to_tensor=True)
    matched = []

    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        chunk_embeddings = model.encode(chunk, convert_to_tensor=True)
        # One row per sentence in the chunk, one column per reference phrase.
        similarities = util.cos_sim(chunk_embeddings, reference_embeddings)

        for sentence, row in zip(chunk, similarities):
            best_score = row.max().item()
            if best_score >= threshold:
                matched.append(sentence.strip())

    if not matched:
        return ["NA"]
    return matched
58
+
59
+ # Pipeline for PDFs
60
def _count_matches(matches):
    """Count real matches, ignoring the "NA" placeholder that stands for "no match"."""
    return sum(1 for sentence in matches if sentence != "NA")


def _percentage(count, total):
    """Express ``count`` as a percentage of ``total``."""
    return (count / total) * 100


def run_pipeline(pdf_folder):
    """Score every PDF in ``pdf_folder`` and return the results as a DataFrame.

    For each file whose name ends in ``.pdf`` (case-insensitive) the text is
    extracted, split into sentences, and matched against the four reference
    phrase lists. Each score is the percentage of the document's sentences
    that matched the corresponding list.

    Args:
        pdf_folder: Path of the directory containing the PDF files.

    Returns:
        A ``pandas.DataFrame`` with one row per PDF, in sorted file-name
        order, and the columns "Company" (file name without extension),
        "Relative Focus Score", "Environment Score", "Claims Score",
        "Actions Score", "Net Action" (actions minus claims) and "Direction".
        Scores are rounded to two decimals. The frame is empty when the
        folder holds no PDFs.
    """
    rows = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the resulting DataFrame reproducible between runs.
    pdf_files = sorted(
        f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")
    )

    for pdf in pdf_files:
        company_name = os.path.splitext(pdf)[0]
        pdf_path = os.path.join(pdf_folder, pdf)

        text = extract_text(pdf_path)
        sentences = split_sentences(text)
        total_sentences = len(sentences) if sentences else 1  # avoid division by zero

        env_count = _count_matches(semantic_matches(sentences, env_ref))
        esg_count = _count_matches(semantic_matches(sentences, esg_ref))
        action_count = _count_matches(semantic_matches(sentences, action_ref))
        # Claims are matched with a slightly lower threshold than the other categories.
        claim_count = _count_matches(
            semantic_matches(sentences, claim_ref, threshold=0.54)
        )

        env_score = _percentage(env_count, total_sentences)
        claim_score = _percentage(claim_count, total_sentences)
        action_score = _percentage(action_count, total_sentences)
        relative_focus = _percentage(esg_count, total_sentences)

        net_action = action_score - claim_score
        # NOTE(review): a net score of exactly zero (including documents with
        # no matches at all) is reported as "Negative" - confirm this is intended.
        net_direction = "Positive" if net_action > 0 else "Negative"

        rows.append({
            "Company": company_name,
            "Relative Focus Score": round(relative_focus, 2),
            "Environment Score": round(env_score, 2),
            "Claims Score": round(claim_score, 2),
            "Actions Score": round(action_score, 2),
            "Net Action": round(net_action, 2),
            "Direction": net_direction,
        })

    return pd.DataFrame(rows)