Basementup committed on
Commit
951a8cb
·
verified ·
1 Parent(s): 35e138e

Upload legislation_manager.py

Browse files
Files changed (1) hide show
  1. legislation_manager.py +153 -0
legislation_manager.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import hashlib
4
+ import os
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ from datetime import datetime
8
+ from huggingface_hub import HfApi
9
+
10
# Path of the local JSON file holding the rules. The default is the original
# hard-coded location; set the LEGISLATION_DATA_FILE environment variable to
# run the app on a machine where that directory does not exist.
DATA_FILE = os.environ.get("LEGISLATION_DATA_FILE") or "/home/ubuntu/legislation_rules.json"
11
+
12
def load_data(path=None):
    """Load the stored records from the JSON data file.

    Args:
        path: Optional file to read. When omitted, the module-level
            ``DATA_FILE`` is used.

    Returns:
        The list decoded from the file, or an empty list when the file does
        not exist, cannot be decoded as UTF-8 JSON, or does not contain a
        JSON list at the top level.
    """
    source = DATA_FILE if path is None else path
    try:
        # Open directly instead of checking os.path.exists first: that avoids
        # a race between the check and the open. The explicit encoding keeps
        # reads independent of the platform's locale default.
        with open(source, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; an unreadable file is treated as an empty dataset.
        return []
    # Callers iterate over and append to the result, so anything other than
    # a list is treated the same way as an unreadable file.
    return data if isinstance(data, list) else []
20
+
21
def save_data(data, path=None):
    """Write ``data`` to the JSON data file, replacing it atomically.

    The payload is serialised to a sibling ``.tmp`` file first and then moved
    over the target with ``os.replace``. If serialisation fails part-way, the
    previous file is left untouched instead of being truncated.

    Args:
        data: JSON-serialisable object to store.
        path: Optional destination file. When omitted, the module-level
            ``DATA_FILE`` is used.

    Raises:
        TypeError: If ``data`` contains values ``json`` cannot serialise.
        OSError: If the directory or file cannot be written.
    """
    target = DATA_FILE if path is None else path

    parent = os.path.dirname(target)
    if parent:
        # A first run on a fresh machine may not have the directory yet.
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{target}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, target)
    except BaseException:
        # Do not leave a half-written temp file behind; then re-raise.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
24
+
25
def get_canonical_hash(text):
    """Return the SHA-256 hex digest of ``text`` with surrounding whitespace removed.

    Only leading and trailing whitespace is trimmed before hashing; the
    remainder is encoded as UTF-8 and hashed as-is.
    """
    trimmed = text.strip()
    digest = hashlib.sha256()
    digest.update(trimmed.encode('utf-8'))
    return digest.hexdigest()
27
+
28
def add_rule_manually(act, title, text, source):
    """Append one rule to the dataset unless its text is blank or already stored.

    The rule's ``deterministic_id`` is the hash of ``text`` as computed by
    ``get_canonical_hash``; a record with the same ID counts as a duplicate.

    Args:
        act: Name of the act or legislation the rule belongs to.
        title: Section or rule title.
        text: The rule text; must contain non-whitespace characters.
        source: URL the text was taken from.

    Returns:
        A status message. Success messages start with "Successfully added";
        rejections start with "Error:".
    """
    # Every blank submission would hash to the same ID, so one empty record
    # would be stored and then block all later blanks as "duplicates".
    if not text or not text.strip():
        return "Error: Rule text is required."

    data = load_data()
    det_id = get_canonical_hash(text)

    # Use .get so a hand-edited record lacking the key cannot raise KeyError.
    already_stored = any(
        isinstance(r, dict) and r.get('deterministic_id') == det_id
        for r in data
    )
    if already_stored:
        return "Error: This rule already exists in the dataset (matching hash)."

    new_rule = {
        "act": act,
        "section_title": title,
        "text": text,
        "source_url": source,
        "deterministic_id": det_id,
        "added_at": datetime.now().isoformat(),
    }
    data.append(new_rule)
    save_data(data)
    return f"Successfully added: {title} from {act}"
46
+
47
def scrape_fca_prin():
    """Fetch the FCA Handbook PRIN 2.1 page and ingest its table rows as rules.

    The first ``<table>`` on the page is used. Every row with at least two
    ``<td>`` cells is passed to ``add_rule_manually`` with the first cell as
    the principle title and the second as the rule text.

    Returns:
        A status message: the number of newly added principles, or an error
        description if the request, parsing or storage failed.
    """
    url = "https://handbook.fca.org.uk/handbook/PRIN/2/1.html"
    try:
        # Without a timeout a stalled connection would block the UI handler
        # indefinitely.
        response = requests.get(url, timeout=30)
        # Fail on 4xx/5xx here rather than parsing an error page and then
        # reporting a misleading "table not found".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        principles_table = soup.find('table')
        if not principles_table:
            return "Error: Could not find the Principles table on the FCA site."

        added_count = 0
        for row in principles_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) < 2:
                continue
            title = cols[0].get_text(strip=True)
            text = cols[1].get_text(strip=True)
            status = add_rule_manually("FCA Handbook: PRIN", f"Principle {title}", text, url)
            # add_rule_manually reports results as text; only successful
            # inserts are counted, duplicates are skipped silently.
            if "Successfully" in status:
                added_count += 1

        return f"Successfully ingested {added_count} FCA Principles."
    except Exception as e:
        # UI boundary: report the failure as a status string, don't raise.
        return f"FCA Scraping failed: {str(e)}"
74
+
75
def sync_to_huggingface(token, dataset_id):
    """Upload the local data file to a Hugging Face dataset repository.

    The file is stored in the repository as ``legislation_rules.json``.

    Args:
        token: Hugging Face access token used for the upload.
        dataset_id: Target dataset repository ID, e.g. ``username/name``.

    Returns:
        A status message describing success or the reason for failure.
    """
    # Pasted credentials often carry stray whitespace or newlines, and a
    # whitespace-only value should count as missing.
    token = (token or "").strip()
    dataset_id = (dataset_id or "").strip()
    if not token or not dataset_id:
        return "Error: HF Token and Dataset ID are required."

    # Report an empty local dataset clearly instead of letting the upload
    # call fail with a generic error.
    if not os.path.isfile(DATA_FILE):
        return "Error: No local dataset file found. Add at least one rule before syncing."

    api = HfApi()
    try:
        api.upload_file(
            path_or_fileobj=DATA_FILE,
            path_in_repo="legislation_rules.json",
            repo_id=dataset_id,
            repo_type="dataset",
            token=token,
        )
        return f"Successfully synced dataset to {dataset_id}"
    except Exception as e:
        # UI boundary: report the failure as a status string, don't raise.
        return f"Sync failed: {str(e)}"
91
+
92
def view_dataset_stats():
    """Summarise the stored dataset as text.

    Returns:
        "Dataset is currently empty." when no records are loaded; otherwise a
        "Total Rules" line followed by one "- <act>: <count> rules" line per
        act, in order of first appearance, each terminated by a newline.
    """
    data = load_data()
    if not data:
        return "Dataset is currently empty."

    acts = {}
    for r in data:
        # A record without an 'act' key (e.g. after a hand edit) is counted
        # under "Unknown" rather than raising KeyError and breaking the view.
        name = r.get('act', 'Unknown') if isinstance(r, dict) else 'Unknown'
        acts[name] = acts.get(name, 0) + 1

    lines = [f"Total Rules: {len(data)}"]
    lines.extend(f"- {act}: {count} rules" for act, count in acts.items())
    return "\n".join(lines) + "\n"
105
+
106
# Gradio UI: three tabs (manual entry, FCA ingestion, Hugging Face sync) wired
# to the handler functions defined above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚖️ Legislation & FCA Dataset Manager")
    gr.Markdown("### Create and Expand your Deterministic Rules of Law Dataset")

    with gr.Tab("➕ Add New Rule"):
        with gr.Row():
            with gr.Column():
                act_input = gr.Textbox(label="Act/Legislation Name", placeholder="e.g., Consumer Rights Act 2015")
                title_input = gr.Textbox(label="Section/Rule Title", placeholder="e.g., Section 9: Satisfactory Quality")
                source_input = gr.Textbox(label="Source URL", placeholder="https://www.legislation.gov.uk/...")
                text_input = gr.TextArea(label="Rule Text", placeholder="Paste the official text here...")
                add_btn = gr.Button("Add to Deterministic Dataset", variant="primary")
            with gr.Column():
                add_status = gr.Textbox(label="Status")
                # Pass the function itself, not its result, so the stats are
                # computed on each page load instead of once at start-up.
                stats_view = gr.Textbox(label="Dataset Stats", value=view_dataset_stats)
                refresh_btn = gr.Button("Refresh Stats")

    with gr.Tab("🏦 FCA Guidelines"):
        gr.Markdown("### 🛠️ FCA Handbook Automation")
        gr.Markdown("Click below to automatically ingest the 12 Principles for Businesses (PRIN) from the official FCA Handbook.")
        fca_btn = gr.Button("Ingest FCA PRIN Principles", variant="secondary")
        fca_status = gr.Textbox(label="FCA Ingestion Status")

    with gr.Tab("☁️ Sync to Hugging Face"):
        gr.Markdown("Push your local collection to your Hugging Face account.")
        hf_token = gr.Textbox(label="HF Write Token", type="password")
        hf_id = gr.Textbox(label="Dataset ID", placeholder="username/my-legal-dataset")
        sync_btn = gr.Button("Sync Now")
        sync_status = gr.Textbox(label="Sync Status")

    # After a rule is added or principles are ingested, refresh the stats box
    # so it does not show stale counts until "Refresh Stats" is clicked.
    add_btn.click(
        fn=add_rule_manually,
        inputs=[act_input, title_input, text_input, source_input],
        outputs=add_status
    ).then(fn=view_dataset_stats, outputs=stats_view)

    fca_btn.click(fn=scrape_fca_prin, outputs=fca_status).then(
        fn=view_dataset_stats, outputs=stats_view
    )

    refresh_btn.click(fn=view_dataset_stats, outputs=stats_view)

    sync_btn.click(
        fn=sync_to_huggingface,
        inputs=[hf_token, hf_id],
        outputs=sync_status
    )

if __name__ == "__main__":
    demo.launch()