Spaces:

raminass
/

SCOTUS

Build error

App Files Files Community

raminass committed on Sep 29, 2023

Commit

8ddc567

1 Parent(s): 7268745

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

README.md +3 -9
app.py +33 -22
utils/.DS_Store +0 -0
utils/__init__.py +33 -8
utils/__pycache__/__init__.cpython-310.pyc +0 -0
utils/__pycache__/cleaning.cpython-310.pyc +0 -0
utils/cleaning.py +109 -75

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: SCOTUS
-emoji: 😻
-colorFrom: gray
-colorTo: yellow
-sdk: gradio
-sdk_version: 3.45.1
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: scotus
 app_file: app.py
+sdk: gradio
+sdk_version: 3.45.2
 ---

app.py CHANGED Viewed

@@ -1,36 +1,47 @@
 import gradio as gr
-from transformers import pipeline, TextClassificationPipeline
 from utils import *
 pipe = pipeline(model="raminass/scotus-v10", top_k=13, padding=True, truncation=True)
-def average_text(text, model):
-  # result = classifier(df_train[(df_train.case_name==case) & (df_train.category=='per_curiam')]['clean_text'].to_list())
-  result = model(text)
-  pred = {}
-  for c in result:
-    for d in c:
-      if d['label'] not in pred:
-        pred[d['label']] = [round(d['score'],2)]
-      else:
-        pred[d['label']].append(round(d['score'],2))
-  sumary = {k:round(sum(v)/len(v),2) for k,v in pred.items()}
-  result = [[{k: round(v, 2) if k=='score' else v for k, v in dct.items()} for dct in lst ]  for lst in result]
-  return dict(sorted(sumary.items(), key=lambda x: x[1],reverse=True)), result
 def greet(opinion):
-    result = average_text(chunk_data(remove_citations(opinion))['text'].to_list(),pipe)
-    # print(f"average prediction:")
-    # display(result[0])
-    # print(f"paragraph prediction:")
-    # display(result[1])
-    return result[0]
 with gr.Blocks() as demo:
     opinion = gr.Textbox(label="Opinion")
-    output = gr.Textbox(label="Result")
     greet_btn = gr.Button("Predict")
-    greet_btn.click(fn=greet, inputs=opinion, outputs=output, api_name="SCOTUS")
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+from transformers import pipeline
 from utils import *
 pipe = pipeline(model="raminass/scotus-v10", top_k=13, padding=True, truncation=True)
+# Number of (Textbox, Label) output pairs created in the layout below; greet()
+# hides the pairs it does not need.
+max_textboxes = 100
+# https://www.gradio.app/guides/controlling-layout
 def greet(opinion):
+    """Predict label scores for an opinion, overall and per text chunk.
+
+    The opinion is stripped of citations, split into chunks and classified.
+
+    Returns:
+        A list with one value per output component wired to the Predict
+        button: the overall label->score mapping, followed by
+        ``max_textboxes`` (Textbox, Label) pairs. Pairs beyond the number of
+        chunks are returned hidden.
+    """
+    chunks = chunk_data(remove_citations(opinion))["text"].to_list()
+    overall, per_chunk = average_text(chunks, pipe)
+    # The layout only has max_textboxes output pairs. Returning more pairs than
+    # that would hand the click handler more values than it has outputs, so
+    # chunks past the limit still count toward the overall score but are not
+    # displayed individually.
+    shown = min(len(chunks), max_textboxes)
+    wrt_boxes = []
+    for chunk, scores in zip(chunks[:shown], per_chunk[:shown]):
+        wrt_boxes.append(gr.Textbox(chunk, visible=True))
+        wrt_boxes.append(gr.Label(value=scores, visible=True))
+    hidden = [gr.Textbox(visible=False), gr.Label(visible=False)] * (
+        max_textboxes - shown
+    )
+    return [overall] + wrt_boxes + hidden
 with gr.Blocks() as demo:
     opinion = gr.Textbox(label="Opinion")
+    # gr.outputs is Gradio's deprecated legacy namespace; use the same gr.Label
+    # component as the per-paragraph outputs below.
+    op_level = gr.Label(num_top_classes=13, label="Overall")
     greet_btn = gr.Button("Predict")
+    # Pre-create max_textboxes hidden (Textbox, Label) pairs; greet() reveals
+    # one pair per chunk.
+    textboxes = []
+    for i in range(max_textboxes):
+        t = gr.Textbox(f"Textbox {i}", visible=False, label=f"Paragraph {i+1} Text")
+        par_level = gr.Label(
+            num_top_classes=5, label=f"Paragraph {i+1} Prediction", visible=False
+        )
+        textboxes.append(t)
+        textboxes.append(par_level)
+    # Output order must match the list greet() returns: overall first, then
+    # the alternating Textbox/Label pairs.
+    greet_btn.click(
+        fn=greet,
+        inputs=opinion,
+        outputs=[op_level] + textboxes,
+    )
 if __name__ == "__main__":
     demo.launch()

utils/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

utils/__init__.py CHANGED Viewed

@@ -3,14 +3,39 @@ import pandas as pd
 import numpy as np
 import json
-with open('utils/id2label.json', 'r') as j:
-     id2label = json.loads(j.read())
-with open('utils/label2id.json', 'r') as j:
-     label2id = json.loads(j.read())
-def find_case_by_name(df, name):
-  return display(HTML(df[df['case_name'].str.contains(name)].iloc[:,:-1].to_html(render_links=True, escape=False)))
-def head_df(df):
-  return display(HTML(df.iloc[:,:-1].head().to_html(render_links=True, escape=False)))

 import numpy as np
 import json
+# Label maps are read once at import time; the paths are relative to the
+# process working directory.
+with open("utils/id2label.json", "r") as j:
+    id2label = json.load(j)
+with open("utils/label2id.json", "r") as j:
+    label2id = json.load(j)
+def average_text(text, model):
+    # result = classifier(df_train[(df_train.case_name==case) & (df_train.category=='per_curiam')]['clean_text'].to_list())
+    result = model(text)
+    pred = {}
+    for c in result:
+        for d in c:
+            if d["label"] not in pred:
+                pred[d["label"]] = [round(d["score"], 2)]
+            else:
+                pred[d["label"]].append(round(d["score"], 2))
+    sumary = {k: round(sum(v) / len(v), 2) for k, v in pred.items()}
+    result = [{dct["label"]: round(dct["score"], 2) for dct in lst} for lst in result]
+    return dict(sorted(sumary.items(), key=lambda x: x[1], reverse=True)), result
+# def find_case_by_name(df, name):
+#     return display(
+#         HTML(
+#             df[df["case_name"].str.contains(name)]
+#             .iloc[:, :-1]
+#             .to_html(render_links=True, escape=False)
+#         )
+#     )
+# def head_df(df):
+#     return display(
+#         HTML(df.iloc[:, :-1].head().to_html(render_links=True, escape=False))
+#     )

utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (1.54 kB). View file

utils/__pycache__/cleaning.cpython-310.pyc ADDED Viewed

Binary file (4.81 kB). View file

utils/cleaning.py CHANGED Viewed

@@ -1,123 +1,148 @@
 import subprocess
 import sys
 import re
-import pandas as pd
 try:
     import eyecite
 except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", 'eyecite'])
 finally:
     from eyecite import find, clean
 # @title
 def full_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.year:
-      pattern = r'\([^)]*{}\)'.format(citation.metadata.year)  # Matches any word that ends with "year"
-      text = re.sub(pattern, '', text)
     if citation.metadata.pin_cite:
-      text = text.replace(citation.metadata.pin_cite, "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.plaintiff:
-      text = text.replace(f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", "")
-    publisher_date = " ".join(i for i in (citation.metadata.court, citation.metadata.year) if i)
     if publisher_date:
-      text = text.replace(f"{publisher_date}", "")
     if citation.metadata.extra:
-      text = text.replace(citation.metadata.extra, "")
     return text
 def supra_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.pin_cite:
-      text = text.replace(citation.metadata.pin_cite, "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.antecedent_guess:
-      text = text.replace(citation.metadata.antecedent_guess, "")
     return text
 def short_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.year:
-      pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
     if citation.metadata.antecedent_guess:
-      text = text.replace(citation.metadata.antecedent_guess, "")
     return text
 def id_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.pin_cite:
-      text = text.replace(citation.metadata.pin_cite, "")
     return text
 def unknown_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     return text
 def full_law_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     return text
 def full_journal_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.year:
-      pattern = r'\([^)]*{}\)'.format(citation.metadata.year)  # Matches any word that ends with "year"
-      text = re.sub(pattern, '', text)
     if citation.metadata.pin_cite:
-      text = text.replace(citation.metadata.pin_cite, "")
     if citation.metadata.parenthetical:
-      text = text.replace(f"({citation.metadata.parenthetical})", "")
     return text
 def all_commas(text: str) -> str:
     return re.sub(r"\,+", ",", text)
 def all_dots(text: str) -> str:
     return re.sub(r"\.+", ".", text)
 functions_dict = {
-    'FullCaseCitation': full_case,
-    'SupraCitation': supra_case,
-    'ShortCaseCitation': short_case,
-    'IdCitation': id_case,
-    'UnknownCitation': unknown_case,
-    'FullLawCitation': full_law_case,
-    'FullJournalCitation': full_journal_case,
 }
 # @title
 def remove_citations(input_text):
-  #clean text
-  plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])
-  #remove citations
-  found_citations = find.get_citations(plain_text)
-  for citation in found_citations:
-    plain_text = functions_dict[citation.__class__.__name__](citation, plain_text)
-  #clean text
-  plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'underscores','all_whitespace', all_commas, all_dots])
-  plain_text = clean.clean_text(plain_text, ['inline_whitespace','all_whitespace'])
-  pattern = r"\*?\d*\s*I+\n"
-  plain_text = re.sub(pattern, '', plain_text)
-  pattern = r"\s[,.]"
-  plain_text = re.sub(pattern, '', plain_text)
-  return plain_text
 def split_text(text):
     words = text.split()
     chunks = []
     for i in range(0, len(words), 420):
-        chunks.append(' '.join(words[i:i+430]))
     return chunks
@@ -130,37 +155,46 @@ def chunk_text_to_paragraphs(text):
     return paragraphs
 # @title
 def split_data(data, id2label, label2id):
-  data_dict = {'author_name': [],
-              'label': [],
-              'category': [],
-              'case_name': [],
-              'url': [],
-              'text': []}
-  opinions_split = pd.DataFrame(data_dict)
-  opinions_split['label'] = opinions_split['label'].astype(int)
-  for index, row in data.iterrows():
-      # chunks = chunk_text_to_paragraphs(row['text'])
-      chunks = split_text(row['clean_text'])
-      for chunk in chunks:
-        if len(chunk)<1000:
-          continue
-        tmp = pd.DataFrame({'author_name': row['author_name'],'label': [label2id[row['author_name']]],
-                              'category': row['category'],'case_name': row['case_name'],
-                              'url': [row['absolute_url']], 'text': [chunk]})
-        opinions_split = pd.concat([opinions_split, tmp])
-  return opinions_split
 def chunk_data(data):
-  data_dict = {'text': []}
-  opinions_split = pd.DataFrame(data_dict)
-  chunks = split_text(data)
-  for chunk in chunks:
-    if len(chunk)<1000:
-      continue
-    tmp = pd.DataFrame({'label': [200],'text': [chunk]})
-    opinions_split = pd.concat([opinions_split, tmp])
-  return opinions_split

 import subprocess
 import sys
 import re
+import pandas as pd
 try:
     import eyecite
 except ImportError:
+    # eyecite is missing: install it with pip for the running interpreter.
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
 finally:
+    # Runs whether or not the install was needed; if the install failed, this
+    # import raises as well.
     from eyecite import find, clean
 # @title
 def full_case(citation, text):
+    """Remove a full case citation and its metadata fragments from ``text``.
+
+    Strips, in order: the matched citation text, any parenthesised span
+    ending with the year, the pin cite, the parenthetical, the
+    "plaintiff v. defendant" caption, the "court year" string and the extra
+    text. Every occurrence of each fragment is removed.
+    """
     text = text.replace(citation.matched_text(), "")
+    meta = citation.metadata
+    if meta.year:
+        # Matches a parenthesised span that ends with the year.
+        text = re.sub(r"\([^)]*{}\)".format(meta.year), "", text)
+    if meta.pin_cite:
+        text = text.replace(meta.pin_cite, "")
+    if meta.parenthetical:
+        text = text.replace(f"({meta.parenthetical})", "")
+    if meta.plaintiff:
+        caption = f"{meta.plaintiff} v. {meta.defendant}"
+        text = text.replace(caption, "")
+    publisher_date = " ".join(part for part in (meta.court, meta.year) if part)
+    if publisher_date:
+        text = text.replace(f"{publisher_date}", "")
+    if meta.extra:
+        text = text.replace(meta.extra, "")
+    return text
 def supra_case(citation, text):
+    """Remove a supra citation, its pin cite, parenthetical and antecedent guess."""
     text = text.replace(citation.matched_text(), "")
+    meta = citation.metadata
+    fragments = (
+        meta.pin_cite,
+        meta.parenthetical and f"({meta.parenthetical})",
+        meta.antecedent_guess,
+    )
+    for fragment in fragments:
+        # Missing metadata fields are falsy and are simply skipped.
+        if fragment:
+            text = text.replace(fragment, "")
+    return text
 def short_case(citation, text):
+    """Remove a short-form case citation and its metadata fragments from ``text``.
+
+    Strips the matched citation text, the parenthetical, any parenthesised
+    span ending with the year, and the antecedent guess.
+    """
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.year:
+        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
+        # Apply the pattern, as full_case and full_journal_case do; without
+        # this call the year parenthetical was left in the text.
+        text = re.sub(pattern, "", text)
     if citation.metadata.antecedent_guess:
+        text = text.replace(citation.metadata.antecedent_guess, "")
     return text
 def id_case(citation, text):
+    """Remove an "Id." citation plus its parenthetical and pin cite from ``text``."""
+    stripped = text.replace(citation.matched_text(), "")
+    meta = citation.metadata
+    if meta.parenthetical:
+        stripped = stripped.replace(f"({meta.parenthetical})", "")
+    if meta.pin_cite:
+        stripped = stripped.replace(meta.pin_cite, "")
+    return stripped
 def unknown_case(citation, text):
+    """Remove an unknown-type citation and its parenthetical from ``text``."""
+    cleaned = text.replace(citation.matched_text(), "")
+    parenthetical = citation.metadata.parenthetical
+    if not parenthetical:
+        return cleaned
+    return cleaned.replace(f"({parenthetical})", "")
 def full_law_case(citation, text):
+    """Remove a statutory (law) citation and its parenthetical from ``text``."""
+    without_cite = text.replace(citation.matched_text(), "")
+    note = citation.metadata.parenthetical
+    return without_cite.replace(f"({note})", "") if note else without_cite
 def full_journal_case(citation, text):
+    """Remove a journal citation, its year span, pin cite and parenthetical."""
     text = text.replace(citation.matched_text(), "")
+    meta = citation.metadata
+    if meta.year:
+        # Matches a parenthesised span that ends with the year.
+        text = re.sub(r"\([^)]*{}\)".format(meta.year), "", text)
+    for fragment in (meta.pin_cite, meta.parenthetical and f"({meta.parenthetical})"):
+        if fragment:
+            text = text.replace(fragment, "")
+    return text
 def all_commas(text: str) -> str:
+    """Collapse every run of consecutive commas into a single comma."""
+    collapsed = re.sub(r"\,+", ",", text)
+    return collapsed
 def all_dots(text: str) -> str:
+    """Collapse every run of consecutive periods into a single period."""
+    collapsed = re.sub(r"\.+", ".", text)
+    return collapsed
 functions_dict = {
+    # Keys are citation class names (looked up via citation.__class__.__name__
+    # in remove_citations); values are the matching strip functions.
+    "FullCaseCitation": full_case,
+    "SupraCitation": supra_case,
+    "ShortCaseCitation": short_case,
+    "IdCitation": id_case,
+    "UnknownCitation": unknown_case,
+    "FullLawCitation": full_law_case,
+    "FullJournalCitation": full_journal_case,
 }
 # @title
 def remove_citations(input_text):
+    """Return ``input_text`` with citations stripped and punctuation tidied.
+
+    The text is cleaned with eyecite, every citation eyecite finds is removed
+    by the handler registered for its class in ``functions_dict``, and the
+    result is cleaned again (whitespace, repeated commas/periods, punctuation
+    left dangling after a space).
+    """
+    # clean text
+    plain_text = clean.clean_text(
+        input_text, ["html", "inline_whitespace", "underscores"]
+    )
+    # remove citations
+    found_citations = find.get_citations(plain_text)
+    for citation in found_citations:
+        handler = functions_dict.get(citation.__class__.__name__)
+        if handler is None:
+            # No dedicated handler for this citation class (eyecite may return
+            # classes not listed in functions_dict): strip only the matched
+            # text instead of failing the whole request with a KeyError.
+            plain_text = plain_text.replace(citation.matched_text(), "")
+        else:
+            plain_text = handler(citation, plain_text)
+    # clean text
+    plain_text = clean.clean_text(
+        plain_text,
+        ["inline_whitespace", "underscores", "all_whitespace", all_commas, all_dots],
+    )
+    plain_text = clean.clean_text(plain_text, ["inline_whitespace", "all_whitespace"])
+    # NOTE(review): this pattern needs a newline, but "all_whitespace" above
+    # presumably collapses newlines already, so it may never match - confirm.
+    pattern = r"\*?\d*\s*I+\n"
+    plain_text = re.sub(pattern, "", plain_text)
+    # Drop a comma or period that is preceded by whitespace (left behind where
+    # a citation was removed), together with that whitespace character.
+    pattern = r"\s[,.]"
+    plain_text = re.sub(pattern, "", plain_text)
+    return plain_text
 def split_text(text):
+    """Split ``text`` into overlapping windows of whitespace-separated words.
+
+    A window starts every 420 words and holds up to 430 words, so consecutive
+    windows share up to 10 words. Returns the windows as space-joined strings.
+    """
+    tokens = text.split()
+    return [
+        " ".join(tokens[start : start + 430])
+        for start in range(0, len(tokens), 420)
+    ]
     return paragraphs
 # @title
 def split_data(data, id2label, label2id):
+    """Explode each opinion row of ``data`` into one row per text chunk.
+
+    Reads ``clean_text``, ``author_name``, ``category``, ``case_name`` and
+    ``absolute_url`` from every row, splits ``clean_text`` with ``split_text``
+    and skips chunks shorter than 1000 characters. ``label`` is looked up as
+    ``label2id[author_name]``. ``id2label`` is accepted but not used here.
+
+    Returns:
+        A DataFrame with the columns author_name, label, category, case_name,
+        url and text.
+    """
+    data_dict = {
+        "author_name": [],
+        "label": [],
+        "category": [],
+        "case_name": [],
+        "url": [],
+        "text": [],
+    }
+    opinions_split = pd.DataFrame(data_dict)
+    opinions_split["label"] = opinions_split["label"].astype(int)
+    for index, row in data.iterrows():
+        # chunks = chunk_text_to_paragraphs(row['text'])
+        chunks = split_text(row["clean_text"])
+        for chunk in chunks:
+            if len(chunk) < 1000:
+                continue
+            tmp = pd.DataFrame(
+                {
+                    "author_name": row["author_name"],
+                    "label": [label2id[row["author_name"]]],
+                    "category": row["category"],
+                    "case_name": row["case_name"],
+                    "url": [row["absolute_url"]],
+                    "text": [chunk],
+                }
+            )
+            # Each chunk is appended as a one-row frame; the row index is not
+            # reset by this concat.
+            opinions_split = pd.concat([opinions_split, tmp])
+    return opinions_split
 def chunk_data(data):
+    """Split the string ``data`` into chunks and return them as a DataFrame.
+
+    Chunks come from ``split_text``; any chunk shorter than 1000 characters is
+    dropped. Each kept chunk becomes a row holding its ``text`` and a constant
+    ``label`` of 200 (NOTE(review): looks like a placeholder label - confirm).
+    If no chunk is kept, the result is an empty frame with only a ``text``
+    column.
+    """
+    data_dict = {"text": []}
+    opinions_split = pd.DataFrame(data_dict)
+    chunks = split_text(data)
+    for chunk in chunks:
+        if len(chunk) < 1000:
+            continue
+        tmp = pd.DataFrame({"label": [200], "text": [chunk]})
+        # One-row frames are appended one at a time; the row index is not
+        # reset by this concat.
+        opinions_split = pd.concat([opinions_split, tmp])
+    return opinions_split