Tanxshh committed on
Commit
c5ea76d
·
verified ·
1 Parent(s): 4b45f0a

Upload 4 files

Browse files
Files changed (4) hide show
  1. dockerfile +12 -0
  2. pipeline.py +96 -0
  3. requirements.txt +7 -0
  4. server.py +25 -0
dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
# Slim Python base keeps the final image small.
FROM python:3.10-slim

WORKDIR /app

# Install dependencies first so this layer is cached when only code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source (pipeline.py, server.py).
COPY . .

# Port the Flask server binds to in server.py.
EXPOSE 7860

CMD ["python", "server.py"]
pipeline.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import re
import fitz  # PyMuPDF
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the embedding model once at import time; use the GPU when available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Reference phrases: each list anchors one semantic category. Report
# sentences are matched against these via cosine similarity in
# semantic_matches() below.

# Environment-related vocabulary.
env_ref = ["environment","climate change","carbon emissions","pollution","waste","green energy",
           "renewable resources","sustainability","biodiversity","eco-friendly","net zero",
           "solar energy","wind energy","water conservation"]

# Broader ESG (environmental, social, governance) vocabulary.
esg_ref = ["environment","social responsibility","governance","sustainability","carbon emissions",
           "green energy","renewable resources","waste management","climate change","pollution control",
           "biodiversity","eco-friendly","net zero","solar energy","wind energy","water conservation",
           "community development","employee welfare","diversity","ethics"]

# Phrases describing actions already taken (past/completed work).
action_ref = ["implemented","adopted","reduced emissions","recycled","renewable energy",
              "sustainability project","steps taken to reduce carbon emissions",
              "initiatives to help the environment","measures to prevent greenwashing"]

# Phrases describing forward-looking claims/pledges (not yet done).
claim_ref = ["plans to achieve","committed to","targets","pledges","goal","aims to",
             "intent to reduce","objective to be","aims for sustainability","pledged to achieve",
             "will reduce carbon","expect to reach net zero","plans to be carbon neutral by",
             "commitment to net zero by","goal to be eco friendly by","target year for sustainability",
             "striving to be net zero","intends to adopt renewable energy","aiming for eco-friendly operations"]
def extract_text(pdf_path):
    """Return the concatenated plain text of every page of the PDF at *pdf_path*.

    The file is opened with PyMuPDF and closed automatically by the
    context manager even if a page fails to render.
    """
    with fitz.open(pdf_path) as doc:
        # "".join over a generator is O(total length); repeated string
        # += in a loop (the previous form) is quadratic on large PDFs.
        return "".join(page.get_text() for page in doc)
def split_sentences(text):
    """Split *text* into sentences on whitespace following ., ! or ?.

    Empty or whitespace-only fragments are dropped: ``re.split`` returns
    ``[""]`` for empty input and can yield blank tails, which previously
    inflated the sentence count used as the score denominator in
    ``run_pipeline`` and fed empty strings to the embedding model.
    """
    parts = re.split(r'(?<=[.!?])\s+', text)
    return [p for p in parts if p.strip()]
42
+
43
+ def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
44
+ ref_emb = model.encode(reference, convert_to_tensor=True)
45
+ matches = []
46
+ for i in range(0, len(sentences), batch_size):
47
+ batch = sentences[i:i+batch_size]
48
+ sent_emb = model.encode(batch, convert_to_tensor=True)
49
+ sim_matrix = util.cos_sim(sent_emb, ref_emb)
50
+ for j, sim_scores in enumerate(sim_matrix):
51
+ if sim_scores.max().item() >= threshold:
52
+ matches.append(batch[j].strip())
53
+ return matches if matches else ["NA"]
54
+
55
+ # Pipeline for PDFs
56
+ def run_pipeline(pdf_folder):
57
+ data = []
58
+ pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
59
+
60
+ for pdf in pdf_files:
61
+ company_name = os.path.splitext(pdf)[0]
62
+ pdf_path = os.path.join(pdf_folder, pdf)
63
+
64
+ text = extract_text(pdf_path)
65
+ sentences = split_sentences(text)
66
+ total_sentences = len(sentences) if sentences else 1 # avoid division by zero
67
+
68
+ env_sentences = semantic_matches(sentences, env_ref)
69
+ esg_sentences = semantic_matches(sentences, esg_ref)
70
+ action_sentences = semantic_matches(sentences, action_ref)
71
+ claim_sentences = semantic_matches(sentences, claim_ref, threshold=0.54)
72
+
73
+ env_count = len([s for s in env_sentences if s != "NA"])
74
+ esg_count = len([s for s in esg_sentences if s != "NA"])
75
+ action_count = len([s for s in action_sentences if s != "NA"])
76
+ claim_count = len([s for s in claim_sentences if s != "NA"])
77
+
78
+ env_score = (env_count / total_sentences) * 100
79
+ claim_score = (claim_count / total_sentences) * 100
80
+ action_score = (action_count / total_sentences) * 100
81
+ relative_focus = (esg_count / total_sentences) * 100
82
+
83
+ net_action = action_score - claim_score
84
+ net_direction = "Positive" if net_action > 0 else "Negative"
85
+
86
+ data.append({
87
+ "Company": company_name,
88
+ "Relative Focus Score": round(relative_focus, 2),
89
+ "Environment Score": round(env_score, 2),
90
+ "Claims Score": round(claim_score, 2),
91
+ "Actions Score": round(action_score, 2),
92
+ "Net Action": round(net_action, 2),
93
+ "Direction": net_direction
94
+ })
95
+
96
+ return pd.DataFrame(data)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ flask
2
+ pandas
3
+ PyMuPDF
4
+ sentence-transformers
5
+ torch
6
+ gspread
7
+ gspread_dataframe
server.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from flask import Flask, request, jsonify
from pipeline import run_pipeline
import os

app = Flask(__name__)

# Uploaded PDFs are persisted here; created eagerly so the first upload
# request never races directory creation.
PDF_FOLDER = "/app/pdfs"
os.makedirs(PDF_FOLDER, exist_ok=True)
@app.route("/upload_pdf", methods=["POST"])
def upload_pdf():
    """Accept one or more PDFs under the 'pdf' form field, save them into
    PDF_FOLDER, then score the folder with run_pipeline.

    Returns a JSON array of score records. Note this scores every PDF
    currently in the folder, including files from earlier uploads.
    """
    if 'pdf' not in request.files:
        return jsonify({"error": "No PDF uploaded"}), 400

    for file in request.files.getlist('pdf'):
        # basename() strips client-supplied directory components, so a
        # crafted filename like "../../etc/x.pdf" cannot escape PDF_FOLDER
        # (the original joined the raw filename — a path-traversal risk).
        name = os.path.basename(file.filename or "")
        if not name.lower().endswith(".pdf"):
            # Skip unnamed or non-PDF parts; run_pipeline only reads
            # *.pdf anyway, so these could never affect the result.
            continue
        file.save(os.path.join(PDF_FOLDER, name))

    df = run_pipeline(PDF_FOLDER)

    return df.to_json(orient="records")
if __name__ == "__main__":
    # Bind on all interfaces; 7860 matches the port EXPOSEd in the Dockerfile.
    app.run(host="0.0.0.0", port=7860)