Spaces:

lenawilli
/

GDPR

Build error

App Files Files Community

lenawilli commited on Jun 25, 2025

Commit

aeb1acc

verified ·

1 Parent(s): 606351b

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +8 -255

src/streamlit_app.py CHANGED Viewed

@@ -7,16 +7,6 @@ from transformers import AutoTokenizer, AutoModel
 from sklearn.metrics.pairwise import cosine_similarity
 import torch
 import re
-from typing import List, Dict, Any
-from openai import OpenAI
-from dotenv import load_dotenv
-import os
-from sentence_transformers import SentenceTransformer
-from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, XSD
-import os
-import networkx as nx
-from pyvis.network import Network
-import streamlit.components.v1 as components
 # ---------------------------
 # LegalBERT-based compliance checker
@@ -60,7 +50,6 @@ class GDPRComplianceChecker:
             full_text = f"Article {number}: {title}. {body}"
             gdpr_map[number] = {"title": title, "text": full_text}
             texts.append(full_text)
         embeddings = self.get_embeddings(texts)
         return gdpr_map, embeddings
@@ -115,36 +104,7 @@ def chunk_policy_text(text, chunk_size=500):
         chunks.append(current.strip())
     return [chunk for chunk in chunks if len(chunk) > 50]
-def prepare_article_text(article: Dict[str, Any]) -> str:
-    body = " ".join(
-        " ".join(sec.values()) if isinstance(sec, dict) else str(sec)
-        for sec in article.get("sections", [])
-    )
-    return f"Art. {article['article_number']} – {article['article_title']} {body}"
-def get_embedding(text: str) -> List[float]:
-    # If input is a list of strings, clean each string
-    if isinstance(text, list):
-        cleaned_text = [t.replace("\n", " ") for t in text]
-    else:  # single string
-        cleaned_text = text.replace("\n", " ")
-    resp = client.embeddings.create(model=EMBED_MODEL, input=cleaned_text)
-    if isinstance(cleaned_text, list):
-        return [item.embedding for item in resp.data]
-    else:
-        return resp.data[0].embedding
-def rdflib_to_networkx(rdflib_graph):
-    nx_graph = nx.MultiDiGraph()
-    for s, p, o in rdflib_graph:
-        nx_graph.add_edge(str(s), str(o), label=str(p))
-    return nx_graph
-def draw_pyvis_graph(nx_graph):
-    net = Network(height="600px", width="100%", directed=True, notebook=False)
-    net.from_nx(nx_graph)
-    net.repulsion(node_distance=200, central_gravity=0.33, spring_length=100, spring_strength=0.10, damping=0.95)
-    return net
 # ---------------------------
 # Streamlit interface
 # ---------------------------
@@ -159,7 +119,7 @@ with st.sidebar:
 if gdpr_file and policy_file:
     model_choice = st.selectbox(
         "Choose the model to use:",
-        ["Logistic Regression", "MultinomialNB", "LegalBERT (Eurlex)", "SentenceTransformer", "LLM Model", "Knowledge Graphs"]
     )
     gdpr_data = json.load(gdpr_file)
@@ -216,194 +176,9 @@ if gdpr_file and policy_file:
                 "article_scores": dict(article_scores)
             }
-        elif model_choice == "SentenceTransformer":
-            model = joblib.load("sentence_transformer_model.joblib")
-            gdpr_texts = []
-            gdpr_map = {}
-            for article in gdpr_data:
-                number, title = article["article_number"], article["article_title"]
-                body = " ".join([f"{k} {v}" for sec in article["sections"] for k, v in sec.items()])
-                full_text = f"Article {number}: {title}. {body}"
-                gdpr_map[number] = {
-                    "title": title,
-                    "text": full_text
-                }
-                gdpr_texts.append(full_text)
-            gdpr_embeddings = model.encode(gdpr_texts, convert_to_numpy=True)
-            chunks = chunk_policy_text(policy_text)
-            chunk_embeddings = model.encode(chunks, convert_to_numpy=True)
-            sim_matrix = cosine_similarity(gdpr_embeddings, chunk_embeddings)
-            article_scores = {}
-            presence_threshold = 0.35
-            total_score, counted_articles = 0, 0
-            for i, (art_num, art_data) in enumerate(gdpr_map.items()):
-                max_sim = np.max(sim_matrix[i])
-                best_idx = np.argmax(sim_matrix[i])
-                if max_sim < presence_threshold:
-                    continue
-                score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
-                article_scores[art_num] = {
-                    "article_title": art_data["title"],
-                    "compliance_percentage": round(score_pct, 2),
-                    "similarity_score": round(max_sim, 4),
-                    "matched_text_snippet": chunks[best_idx][:300] + "..."
-                }
-                total_score += score_pct
-                counted_articles += 1
-            overall = round(total_score / counted_articles, 2) if counted_articles else 0
-            result = {
-                "overall_compliance_percentage": overall,
-                "relevant_articles_analyzed": counted_articles,
-                "total_policy_chunks": len(chunks),
-                "article_scores": article_scores
-            }
-        elif model_choice == "LLM Model":
-            load_dotenv()
-            api_key = os.getenv("OPENAI_API_KEY")
-            client = OpenAI(api_key=api_key)
-            EMBED_MODEL = "text-embedding-3-small"
-            gdpr_embeddings = {}
-            gdpr_map = {}
-            for art in gdpr_data:
-                number, title = art["article_number"], art["article_title"]
-                art_text = prepare_article_text(art)
-                gdpr_embeddings[art["article_number"]] = {
-                    "embedding": get_embedding(art_text),
-                    "title": art["article_title"]
-                }
-                gdpr_map[number] = {"title": title, "text": art_text}
-            chunks = chunk_policy_text(policy_text)
-            chunk_embeddings = get_embedding(chunks)
-            gdpr_embedding_vectors = [v["embedding"] for v in gdpr_embeddings.values()]
-            sim_matrix = cosine_similarity(gdpr_embedding_vectors, chunk_embeddings)
-            article_scores = {}
-            presence_threshold = 0.35
-            total_score, counted_articles = 0, 0
-            for i, (art_num, art_data) in enumerate(gdpr_map.items()):
-                max_sim = np.max(sim_matrix[i])
-                best_idx = np.argmax(sim_matrix[i])
-                if max_sim < presence_threshold:
-                    continue
-                score_pct = min(100, max(0, (max_sim - presence_threshold) / (1 - presence_threshold) * 100))
-                article_scores[art_num] = {
-                    "article_title": art_data["title"],
-                    "compliance_percentage": round(score_pct, 2),
-                    "similarity_score": round(max_sim, 4),
-                    "matched_text_snippet": chunks[best_idx][:300] + "..."
-                }
-                total_score += score_pct
-                counted_articles += 1
-            overall = round(total_score / counted_articles, 2) if counted_articles else 0
-            result = {
-                "overall_compliance_percentage": overall,
-                "relevant_articles_analyzed": counted_articles,
-                "total_policy_chunks": len(chunks),
-                "article_scores": article_scores
-            }
         elif model_choice == "Knowledge Graphs":
-            EMBED_MODEL = "all-MiniLM-L6-v2"
-            model = SentenceTransformer(EMBED_MODEL)
-            TOP_N = 1
-            BASE_URI = "http://example.org/gdpr#"
-            gdpr_embeddings = {}
-            gdpr_map = {}
-            for art in gdpr_data:
-                number, title = art["article_number"], art["article_title"]
-                art_text = prepare_article_text(art)
-                gdpr_embeddings[art["article_number"]] = {
-                    "embedding": model.encode(art_text),
-                    "title": art["article_title"],
-                    "uri": URIRef(f"{BASE_URI}Article{art['article_number']}")
-                }
-                gdpr_map[number] = {"title": title, "text": art_text}
-            g = Graph()
-            EX = Namespace(BASE_URI)
-            g.bind("ex", EX)
-            # Add article nodes
-            for num, info in gdpr_embeddings.items():
-                g.add((info["uri"], RDF.type, EX.Article))
-                g.add((info["uri"], RDFS.label, Literal(f"Article {num}: {info['title']}")))
-            # Extract GDPR article vectors
-            article_nums = list(gdpr_embeddings.keys())
-            article_vectors = np.array([gdpr_embeddings[num]["embedding"] for num in article_nums])
-            # Score tracking
-            total_score = 0
-            counted_sections = 0
-            chunks = chunk_policy_text(policy_text)
-            report = []
-            presence_threshold = 0.35
-            # Process each policy chunk
-            for idx, text in enumerate(chunks, start=1):
-                if not text.strip():
-                    continue
-                # RDF section node
-                sec_uri = URIRef(f"{BASE_URI}PolicySection{idx}")
-                g.add((sec_uri, RDF.type, EX.PolicySection))
-                g.add((sec_uri, RDFS.label, Literal(f"Section {idx}")))
-                # Embed section
-                sec_emb = model.encode(text)
-                # Similarities to all articles
-                sims = []
-                for i, art_num in enumerate(article_nums):
-                    art_emb = article_vectors[i]
-                    sim = cosine_similarity([sec_emb], [art_emb])[0][0]
-                    sims.append({
-                        "article": art_num,
-                        "title": gdpr_embeddings[art_num]["title"],
-                        "similarity": round(sim, 4),
-                        "uri": gdpr_embeddings[art_num]["uri"],
-                        "text": gdpr_map[art_num]["text"]
-                    })
-                # Sort and pick best match
-                sims.sort(key=lambda x: x["similarity"], reverse=True)
-                top_match = sims[0]
-                # Threshold filtering
-                if top_match["similarity"] < presence_threshold:
-                    continue
-                # Compliance score
-                score_pct = min(100, max(0, (top_match["similarity"] - presence_threshold) / (1 - presence_threshold) * 100))
-                # Add RDF triples
-                g.add((sec_uri, EX.relatesTo, top_match["uri"]))
-                g.add((sec_uri, EX.similarityScore, Literal(top_match["similarity"], datatype=XSD.float)))
-                g.serialize(destination="gdpr_policy_graph.ttl", format="turtle")
-                total_score += score_pct
-                counted_sections += 1
-            # Final summary
-            overall = round(total_score / counted_sections, 2) if counted_sections else 0
-            result = {
-                "overall_compliance_percentage": overall,
-                "relevant_sections_analyzed": counted_sections,
-                "total_policy_sections": len(chunks),
-                "ttl": True
-            }
         else:
             result = {}
@@ -412,31 +187,9 @@ if gdpr_file and policy_file:
         st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
         st.markdown("---")
         st.subheader("📋 Detailed Article Breakdown")
-        ttl_file_path = "gdpr_policy_graph.ttl"
-        if result.get('article_scores'):
-            for art_num, data in sorted(result['article_scores'].items(), key=lambda x: -x[1]['compliance_percentage']):
-                with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
-                    st.write(f"**Similarity Score**: {data['similarity_score']}")
-                    st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
-        elif result.get("ttl") and os.path.exists(ttl_file_path):
-            st.markdown("---")
-            st.subheader("🧠 Interactive RDF Graph Visualization")
-            g = Graph()
-            g.parse(ttl_file_path, format="ttl")
-            nx_graph = rdflib_to_networkx(g)
-            net = draw_pyvis_graph(nx_graph)
-            # Save the interactive graph temporarily
-            net.save_graph("rdf_graph.html")
-            HtmlFile = open("rdf_graph.html", "r", encoding="utf-8").read()
-            # Display interactive graph inside Streamlit
-            components.html(HtmlFile, height=650, scrolling=True)
-        else:
-            st.info("No article scores or RDF graph to display.")
 else:
-    st.info("Please upload both a GDPR JSON file and a company policy text file to begin.")

 from sklearn.metrics.pairwise import cosine_similarity
 import torch
 import re
 # ---------------------------
 # LegalBERT-based compliance checker
             full_text = f"Article {number}: {title}. {body}"
             gdpr_map[number] = {"title": title, "text": full_text}
             texts.append(full_text)
         embeddings = self.get_embeddings(texts)
         return gdpr_map, embeddings
         chunks.append(current.strip())
     return [chunk for chunk in chunks if len(chunk) > 50]
 # ---------------------------
 # Streamlit interface
 # ---------------------------
 if gdpr_file and policy_file:
     model_choice = st.selectbox(
         "Choose the model to use:",
+        ["Logistic Regression", "MultinomialNB", "LegalBERT (Eurlex)", "Knowledge Graphs"]
     )
     gdpr_data = json.load(gdpr_file)
                 "article_scores": dict(article_scores)
             }
         elif model_choice == "Knowledge Graphs":
+            st.warning("Knowledge Graphs model is not implemented yet.")
+            result = {}
         else:
             result = {}
         st.subheader(f"✅ Overall Compliance Score: {result['overall_compliance_percentage']}%")
         st.markdown("---")
         st.subheader("📋 Detailed Article Breakdown")
+        for art_num, data in sorted(result['article_scores'].items(), key=lambda x: -x[1]['compliance_percentage']):
+            with st.expander(f"Article {art_num} - {data['article_title']} ({data['compliance_percentage']}%)"):
+                st.write(f"**Similarity Score**: {data['similarity_score']}")
+                st.write(f"**Matched Text**:\n\n{data['matched_text_snippet']}")
 else:
+    st.info("Please upload both a GDPR JSON file and a company policy text file to begin.")