Spaces:

Tanxshh
/

sc-api

Sleeping

App Files Files Community

Tanxshh committed on Aug 20, 2025

Commit

fb5cf67

verified ·

1 Parent(s): bc5c5c2

Upload 5 files

Browse files

Files changed (5) hide show

index.html +75 -0
main.py +39 -0
pipeline.py +96 -0
requirements.txt +7 -0
start.sh +1 -0

index.html ADDED Viewed

	@@ -0,0 +1,75 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>ESG PDF Analyzer</title>
+  <!-- Tailwind is loaded from its CDN; it supplies the utility classes used throughout the markup below. -->
+  <script src="https://cdn.tailwindcss.com"></script>
+</head>
+<body class="bg-gray-100 min-h-screen flex flex-col items-center justify-start p-6">
+  <h1 class="text-3xl font-bold mb-6 text-center text-green-700">ESG PDF Analyzer</h1>
+  <div class="bg-white shadow-lg rounded-lg p-6 w-full max-w-xl">
+    <p class="mb-4 text-gray-600">Upload one or more PDFs to analyze ESG scores. Results will be saved automatically.</p>
+    <!-- The form has no action attribute: the inline script at the end of the body intercepts its submit event and posts the files with fetch(). -->
+    <form id="pdfForm" class="flex flex-col space-y-4">
+      <!-- name="files" is the same field name the script uses when it appends each file to the FormData. -->
+      <input type="file" id="pdfFile" name="files" multiple
+             class="border p-2 rounded focus:outline-none focus:ring-2 focus:ring-green-500" accept=".pdf">
+      <button type="submit"
+              class="bg-green-600 text-white py-2 rounded hover:bg-green-700 transition-colors">Upload & Analyze</button>
+    </form>
+    <!-- Progress message; starts hidden and is shown by the script while a request is in flight. -->
+    <div id="loading" class="hidden mt-4 text-blue-600 font-semibold">Processing PDFs, please wait...</div>
+    <!-- Result panel; starts hidden and is revealed by the script with either the JSON response or an error message. -->
+    <div id="resultContainer" class="mt-6 hidden">
+      <h2 class="text-xl font-semibold mb-2 text-gray-700">Results:</h2>
+      <div id="result" class="bg-gray-50 p-4 rounded max-h-96 overflow-auto"></div>
+    </div>
+  </div>
+  <script>
+    const form = document.getElementById("pdfForm");
+    const resultContainer = document.getElementById("resultContainer");
+    const result = document.getElementById("result");
+    const loading = document.getElementById("loading");
+    form.addEventListener("submit", async (e) => {
+      e.preventDefault();
+      const files = document.getElementById("pdfFile").files;
+      if (files.length === 0) {
+        alert("Please select at least one PDF file.");
+        return;
+      }
+      const formData = new FormData();
+      for (let i = 0; i < files.length; i++) {
+        formData.append("files", files[i]);
+      }
+      loading.classList.remove("hidden");
+      resultContainer.classList.add("hidden");
+      result.textContent = "";
+      try {
+        const response = await fetch("http://127.0.0.1:8000/analyze-pdfs/", {
+          method: "POST",
+          body: formData
+        });
+        if (!response.ok) throw new Error("Upload failed.");
+        const data = await response.json();
+        result.textContent = JSON.stringify(data, null, 2);
+        resultContainer.classList.remove("hidden");
+      } catch (err) {
+        result.textContent = "Error: " + err.message;
+        resultContainer.classList.remove("hidden");
+      } finally {
+        loading.classList.add("hidden");
+      }
+    });
+  </script>
+</body>
+</html>

main.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from fastapi import FastAPI, UploadFile, File
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+import tempfile, shutil, os
+import pandas as pd
+from pipeline import run_pipeline
+# ASGI application object; start.sh serves it with uvicorn as ``main:app``.
+app = FastAPI(title="SC API", version="1.0")
+# Allow frontend to call API
+# NOTE(review): every origin, method and header is accepted here; consider
+# restricting allow_origins to the frontend's origin before exposing this publicly.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# CSV file (relative to the process working directory) to which analyze_pdfs
+# appends the rows produced by each request.
+DATASET_PATH = "dataset.csv"
+@app.post("/analyze-pdfs/")
+async def analyze_pdfs(files: list[UploadFile] = File(...)):
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        for file in files:
+            file_path = os.path.join(tmpdirname, file.filename)
+            with open(file_path, "wb") as buffer:
+                shutil.copyfileobj(file.file, buffer)
+        results = run_pipeline(tmpdirname)
+        json_result = results.to_dict(orient="records")
+        # Save to dataset.csv
+        if os.path.exists(DATASET_PATH):
+            dataset = pd.read_csv(DATASET_PATH)
+            dataset = pd.concat([dataset, results], ignore_index=True)
+        else:
+            dataset = results
+        dataset.to_csv(DATASET_PATH, index=False)
+        return JSONResponse(content={"results": json_result})

pipeline.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import os
+import re
+import fitz  # PyMuPDF
+import torch
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+# Load model
+# Runs once at import time; the resulting ``model`` is the module-level encoder
+# used by semantic_matches. CUDA is selected when torch reports it as available,
+# otherwise the model runs on the CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
+# Reference phrases
+# Each list is passed to semantic_matches as the ``reference`` argument; a
+# sentence counts as a match when it is similar enough to any phrase in the list.
+# env_ref: phrases behind the "Environment Score" column.
+env_ref = ["environment","climate change","carbon emissions","pollution","waste","green energy",
+           "renewable resources","sustainability","biodiversity","eco-friendly","net zero",
+           "solar energy","wind energy","water conservation"]
+# esg_ref: phrases behind the "Relative Focus Score" column.
+esg_ref = ["environment","social responsibility","governance","sustainability","carbon emissions",
+           "green energy","renewable resources","waste management","climate change","pollution control",
+           "biodiversity","eco-friendly","net zero","solar energy","wind energy","water conservation",
+           "community development","employee welfare","diversity","ethics"]
+# action_ref: phrases behind the "Actions Score" column.
+action_ref = ["implemented","adopted","reduced emissions","recycled","renewable energy",
+              "sustainability project","steps taken to reduce carbon emissions",
+              "initiatives to help the environment","measures to prevent greenwashing"]
+# claim_ref: phrases behind the "Claims Score" column (run_pipeline matches these at threshold 0.54).
+claim_ref = ["plans to achieve","committed to","targets","pledges","goal","aims to",
+             "intent to reduce","objective to be","aims for sustainability","pledged to achieve",
+             "will reduce carbon","expect to reach net zero","plans to be carbon neutral by",
+             "commitment to net zero by","goal to be eco friendly by","target year for sustainability",
+             "striving to be net zero","intends to adopt renewable energy","aiming for eco-friendly operations"]
+# Extract text
+def extract_text(pdf_path):
+    text = ""
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            text += page.get_text()
+    return text
+def split_sentences(text):
+    return re.split(r'(?<=[.!?])\s+', text)
+def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
+    ref_emb = model.encode(reference, convert_to_tensor=True)
+    matches = []
+    for i in range(0, len(sentences), batch_size):
+        batch = sentences[i:i+batch_size]
+        sent_emb = model.encode(batch, convert_to_tensor=True)
+        sim_matrix = util.cos_sim(sent_emb, ref_emb)
+        for j, sim_scores in enumerate(sim_matrix):
+            if sim_scores.max().item() >= threshold:
+                matches.append(batch[j].strip())
+    return matches if matches else ["NA"]
+# Pipeline for PDFs
+def run_pipeline(pdf_folder):
+    data = []
+    pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
+    for pdf in pdf_files:
+        company_name = os.path.splitext(pdf)[0]
+        pdf_path = os.path.join(pdf_folder, pdf)
+        text = extract_text(pdf_path)
+        sentences = split_sentences(text)
+        total_sentences = len(sentences) if sentences else 1  # avoid division by zero
+        env_sentences = semantic_matches(sentences, env_ref)
+        esg_sentences = semantic_matches(sentences, esg_ref)
+        action_sentences = semantic_matches(sentences, action_ref)
+        claim_sentences = semantic_matches(sentences, claim_ref, threshold=0.54)
+        env_count = len([s for s in env_sentences if s != "NA"])
+        esg_count = len([s for s in esg_sentences if s != "NA"])
+        action_count = len([s for s in action_sentences if s != "NA"])
+        claim_count = len([s for s in claim_sentences if s != "NA"])
+        env_score = (env_count / total_sentences) * 100
+        claim_score = (claim_count / total_sentences) * 100
+        action_score = (action_count / total_sentences) * 100
+        relative_focus = (esg_count / total_sentences) * 100
+        net_action = action_score - claim_score
+        net_direction = "Positive" if net_action > 0 else "Negative"
+        data.append({
+            "Company": company_name,
+            "Relative Focus Score": round(relative_focus, 2),
+            "Environment Score": round(env_score, 2),
+            "Claims Score": round(claim_score, 2),
+            "Actions Score": round(action_score, 2),
+            "Net Action": round(net_action, 2),
+            "Direction": net_direction
+        })
+    return pd.DataFrame(data)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi
+uvicorn[standard]
+pandas
+torch
+sentence-transformers
+python-multipart
+PyMuPDF

start.sh ADDED Viewed

	@@ -0,0 +1 @@


1	+ uvicorn main:app --host 0.0.0.0 --port "${PORT:-7860}"