Tanxshh committed on
Commit
fb5cf67
·
verified ·
1 Parent(s): bc5c5c2

Upload 5 files

Browse files
Files changed (5) hide show
  1. index.html +75 -0
  2. main.py +39 -0
  3. pipeline.py +96 -0
  4. requirements.txt +7 -0
  5. start.sh +1 -0
index.html ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <!-- Without a viewport declaration mobile browsers render the page zoomed out. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>ESG PDF Analyzer</title>
  <script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-100 min-h-screen flex flex-col items-center justify-start p-6">

  <h1 class="text-3xl font-bold mb-6 text-center text-green-700">ESG PDF Analyzer</h1>

  <div class="bg-white shadow-lg rounded-lg p-6 w-full max-w-xl">
    <p class="mb-4 text-gray-600">Upload one or more PDFs to analyze ESG scores. Results will be saved automatically.</p>

    <form id="pdfForm" class="flex flex-col space-y-4">
      <input type="file" id="pdfFile" name="files" multiple
             class="border p-2 rounded focus:outline-none focus:ring-2 focus:ring-green-500" accept=".pdf">
      <button type="submit"
              class="bg-green-600 text-white py-2 rounded hover:bg-green-700 transition-colors disabled:opacity-50">Upload &amp; Analyze</button>
    </form>

    <div id="loading" class="hidden mt-4 text-blue-600 font-semibold">Processing PDFs, please wait...</div>

    <div id="resultContainer" class="mt-6 hidden">
      <h2 class="text-xl font-semibold mb-2 text-gray-700">Results:</h2>
      <!-- whitespace-pre-wrap keeps the line breaks and indentation of the
           pretty-printed JSON, which a plain div would collapse. -->
      <div id="result" class="bg-gray-50 p-4 rounded max-h-96 overflow-auto whitespace-pre-wrap"></div>
    </div>
  </div>

  <script>
    // Endpoint of the analysis API.
    // NOTE(review): hard-coded to a local development server; this must be
    // changed when the API is reachable under a different host or port.
    const API_URL = "http://127.0.0.1:8000/analyze-pdfs/";

    const form = document.getElementById("pdfForm");
    const fileInput = document.getElementById("pdfFile");
    const submitButton = form.querySelector('button[type="submit"]');
    const resultContainer = document.getElementById("resultContainer");
    const result = document.getElementById("result");
    const loading = document.getElementById("loading");

    // Send the selected PDFs as one multipart request and show the JSON reply.
    form.addEventListener("submit", async (e) => {
      e.preventDefault();

      const files = fileInput.files;
      if (files.length === 0) {
        alert("Please select at least one PDF file.");
        return;
      }

      // Every file goes under the same "files" field name.
      const formData = new FormData();
      for (const file of files) {
        formData.append("files", file);
      }

      // Block a second submission while one is in flight; each request
      // makes the server save its results again.
      submitButton.disabled = true;
      loading.classList.remove("hidden");
      resultContainer.classList.add("hidden");
      result.textContent = "";

      try {
        const response = await fetch(API_URL, {
          method: "POST",
          body: formData
        });

        // Include the status code so a failure can be diagnosed from the page.
        if (!response.ok) {
          throw new Error(`Upload failed (HTTP ${response.status}).`);
        }

        const data = await response.json();
        result.textContent = JSON.stringify(data, null, 2);
      } catch (err) {
        result.textContent = "Error: " + err.message;
      } finally {
        // Both the success and the error path show the result panel.
        resultContainer.classList.remove("hidden");
        loading.classList.add("hidden");
        submitButton.disabled = false;
      }
    });
  </script>

</body>
</html>
main.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ import tempfile, shutil, os
5
+ import pandas as pd
6
+ from pipeline import run_pipeline
7
+
8
# ASGI application object; start.sh launches it as "main:app".
app = FastAPI(title="SC API", version="1.0")

# Allow frontend to call API
# NOTE(review): origins, methods and headers are all wildcards, so a page
# served from any origin may call this API from a browser. Confirm that this
# is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# CSV file that accumulates the results of every request. The path is
# relative, so it resolves against the working directory of the process.
DATASET_PATH = "dataset.csv"
19
+
20
@app.post("/analyze-pdfs/")
async def analyze_pdfs(files: list[UploadFile] = File(...)):
    """Score the uploaded PDFs and append the scores to the dataset CSV.

    The uploads are written to a temporary directory, ``run_pipeline`` is run
    on that directory, and the resulting rows are appended to the CSV file at
    ``DATASET_PATH``.

    Args:
        files: Uploaded files from the multipart form field ``files``.

    Returns:
        A JSON response of the form ``{"results": [<one dict per row>]}``.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        for file in files:
            # The client controls ``filename``. Keep only its last path
            # component so a name such as "../../x.pdf" cannot be written
            # outside the temporary directory.
            raw_name = (file.filename or "").replace("\\", "/")
            safe_name = os.path.basename(raw_name)
            if safe_name in ("", ".", ".."):
                continue  # nothing usable to store this upload under

            file_path = os.path.join(tmpdirname, safe_name)
            with open(file_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

        # The returned DataFrame is held in memory, so it stays valid after
        # the temporary directory has been removed.
        results = run_pipeline(tmpdirname)

    json_result = results.to_dict(orient="records")

    # Save to dataset.csv
    # An empty frame is never written: a CSV without a header row cannot be
    # read back by pd.read_csv and would break every later request.
    if not results.empty:
        try:
            existing = pd.read_csv(DATASET_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No dataset yet, or a file without any parsable columns.
            existing = None

        if existing is None:
            dataset = results
        else:
            dataset = pd.concat([existing, results], ignore_index=True)

        dataset.to_csv(DATASET_PATH, index=False)

    return JSONResponse(content={"results": json_result})
pipeline.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import fitz # PyMuPDF
4
+ import torch
5
+ import pandas as pd
6
+ from sentence_transformers import SentenceTransformer, util
7
+
8
# Sentence-embedding model, placed on the GPU when torch reports one.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Reference phrase lists; sentences are compared against these by
# semantic_matches.

# Environmental topics.
env_ref = [
    "environment",
    "climate change",
    "carbon emissions",
    "pollution",
    "waste",
    "green energy",
    "renewable resources",
    "sustainability",
    "biodiversity",
    "eco-friendly",
    "net zero",
    "solar energy",
    "wind energy",
    "water conservation",
]

# Environmental, social and governance topics.
esg_ref = [
    "environment",
    "social responsibility",
    "governance",
    "sustainability",
    "carbon emissions",
    "green energy",
    "renewable resources",
    "waste management",
    "climate change",
    "pollution control",
    "biodiversity",
    "eco-friendly",
    "net zero",
    "solar energy",
    "wind energy",
    "water conservation",
    "community development",
    "employee welfare",
    "diversity",
    "ethics",
]

# Wording that describes measures already carried out.
action_ref = [
    "implemented",
    "adopted",
    "reduced emissions",
    "recycled",
    "renewable energy",
    "sustainability project",
    "steps taken to reduce carbon emissions",
    "initiatives to help the environment",
    "measures to prevent greenwashing",
]

# Wording that describes plans, targets and pledges.
claim_ref = [
    "plans to achieve",
    "committed to",
    "targets",
    "pledges",
    "goal",
    "aims to",
    "intent to reduce",
    "objective to be",
    "aims for sustainability",
    "pledged to achieve",
    "will reduce carbon",
    "expect to reach net zero",
    "plans to be carbon neutral by",
    "commitment to net zero by",
    "goal to be eco friendly by",
    "target year for sustainability",
    "striving to be net zero",
    "intends to adopt renewable energy",
    "aiming for eco-friendly operations",
]
31
+
32
+ # Extract text
33
def extract_text(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string holding the text of all pages; empty if there are no pages.
    """
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of re-copying the
        # accumulated text for every page.
        return "".join(page.get_text() for page in doc)
39
+
40
def split_sentences(text):
    """Split text into sentences at whitespace that follows '.', '!' or '?'.

    Each sentence is stripped of surrounding whitespace and blank fragments
    are dropped, so empty or whitespace-only text yields an empty list and
    the length of the result is the number of real sentences.

    Args:
        text: The text to split; may be empty.

    Returns:
        A list of non-empty sentence strings, in their original order.
    """
    if not text:
        return []
    fragments = re.split(r'(?<=[.!?])\s+', text)
    # Trailing whitespace after the last terminator produces an empty
    # fragment; keeping it would inflate the sentence count.
    return [fragment.strip() for fragment in fragments if fragment.strip()]
42
+
43
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """Return the sentences that are semantically close to a reference phrase.

    Sentences and reference phrases are embedded with the module-level
    ``model``. A sentence matches when its highest cosine similarity to any
    reference phrase is at least ``threshold``.

    Args:
        sentences: List of sentence strings to test.
        reference: List of reference phrases to compare against.
        threshold: Minimum best cosine similarity for a match.
        batch_size: Number of sentences embedded per ``model.encode`` call.

    Returns:
        The matching sentences, stripped, in input order; or ``["NA"]`` when
        nothing matches, including when either input is empty.
    """
    # Nothing to compare: skip the model entirely.
    if not sentences or not reference:
        return ["NA"]

    ref_emb = model.encode(reference, convert_to_tensor=True)
    matches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        sent_emb = model.encode(batch, convert_to_tensor=True)
        # Best score per sentence, moved to Python floats once per batch
        # rather than with one .item() call per sentence.
        best_scores = util.cos_sim(sent_emb, ref_emb).max(dim=1).values.tolist()
        matches.extend(
            sentence.strip()
            for sentence, score in zip(batch, best_scores)
            if score >= threshold
        )
    return matches if matches else ["NA"]
54
+
55
+ # Pipeline for PDFs
56
# Column order of the DataFrame returned by run_pipeline. Passing it
# explicitly keeps the schema identical when no PDF was processed.
_RESULT_COLUMNS = [
    "Company",
    "Relative Focus Score",
    "Environment Score",
    "Claims Score",
    "Actions Score",
    "Net Action",
    "Direction",
]


def _count_matches(matches):
    """Count matches, ignoring the "NA" placeholder used for "no match"."""
    return sum(1 for sentence in matches if sentence != "NA")


def _percentage(sentences, reference, total_sentences, **match_kwargs):
    """Return the share of sentences matching ``reference``, in percent."""
    matched = _count_matches(semantic_matches(sentences, reference, **match_kwargs))
    return (matched / total_sentences) * 100


def run_pipeline(pdf_folder):
    """Score every PDF in a folder and return one result row per file.

    For each file whose name ends in ".pdf" (case-insensitive) the text is
    extracted and split into sentences, and the percentage of sentences
    matching each reference phrase list is computed.

    Args:
        pdf_folder: Directory containing the PDF files.

    Returns:
        A DataFrame with the columns in ``_RESULT_COLUMNS``; it has zero rows
        but the same columns when the folder holds no PDF. Rows are ordered
        by file name.
    """
    rows = []
    # os.listdir gives no ordering guarantee; sort for a reproducible result.
    pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

    for pdf in pdf_files:
        # The file name without its extension is used as the company name.
        company_name = os.path.splitext(pdf)[0]
        pdf_path = os.path.join(pdf_folder, pdf)

        sentences = split_sentences(extract_text(pdf_path))
        total_sentences = len(sentences) or 1  # avoid division by zero

        env_score = _percentage(sentences, env_ref, total_sentences)
        relative_focus = _percentage(sentences, esg_ref, total_sentences)
        action_score = _percentage(sentences, action_ref, total_sentences)
        claim_score = _percentage(sentences, claim_ref, total_sentences, threshold=0.54)

        net_action = action_score - claim_score
        # A net action of exactly zero is labelled "Negative".
        net_direction = "Positive" if net_action > 0 else "Negative"

        rows.append({
            "Company": company_name,
            "Relative Focus Score": round(relative_focus, 2),
            "Environment Score": round(env_score, 2),
            "Claims Score": round(claim_score, 2),
            "Actions Score": round(action_score, 2),
            "Net Action": round(net_action, 2),
            "Direction": net_direction,
        })

    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pandas
4
+ torch
5
+ sentence-transformers
6
+ python-multipart
7
+ PyMuPDF
start.sh ADDED
@@ -0,0 +1 @@
 
 
1
#!/bin/sh
# Start the API server for the "app" object in main.py on all interfaces.
# The port comes from $PORT and falls back to 8000 when it is unset or empty.
# exec replaces the shell, so signals sent to this script reach uvicorn.
exec uvicorn main:app --host 0.0.0.0 --port "${PORT:-8000}"