import subprocess
import sys
import re

import pandas as pd

try:
    import eyecite
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
finally:
    from eyecite import find, clean

# @title
def full_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.year:
        # Matches a parenthetical containing the citation year, e.g. "(9th Cir. 2003)"
        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
        text = re.sub(pattern, "", text)
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.plaintiff:
        text = text.replace(
            f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", ""
        )
    publisher_date = " ".join(
        i for i in (citation.metadata.court, citation.metadata.year) if i
    )
    if publisher_date:
        text = text.replace(publisher_date, "")
    if citation.metadata.extra:
        text = text.replace(citation.metadata.extra, "")
    return text

def supra_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text

def short_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.year:
        # Remove a parenthetical containing the citation year
        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
        text = re.sub(pattern, "", text)
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text

def id_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    return text

def unknown_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text

def full_law_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text

def full_journal_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.year:
        # Matches a parenthetical containing the citation year, e.g. "(2011)"
        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
        text = re.sub(pattern, "", text)
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text

def all_commas(text: str) -> str:
    # Collapse runs of commas left behind after citation removal.
    return re.sub(r",+", ",", text)


def all_dots(text: str) -> str:
    # Collapse runs of periods left behind after citation removal.
    return re.sub(r"\.+", ".", text)

# Dispatch table: eyecite citation class name -> handler that strips that citation type.
functions_dict = {
    "FullCaseCitation": full_case,
    "SupraCitation": supra_case,
    "ShortCaseCitation": short_case,
    "IdCitation": id_case,
    "UnknownCitation": unknown_case,
    "FullLawCitation": full_law_case,
    "FullJournalCitation": full_journal_case,
}

# @title
def remove_citations(input_text):
    # clean text
    plain_text = clean.clean_text(
        input_text, ["html", "inline_whitespace", "underscores"]
    )
    # remove citations, dispatching on the citation class found by eyecite
    found_citations = find.get_citations(plain_text)
    for citation in found_citations:
        plain_text = functions_dict[citation.__class__.__name__](citation, plain_text)
    # clean text again and collapse punctuation left behind by the removals
    plain_text = clean.clean_text(
        plain_text,
        ["inline_whitespace", "underscores", "all_whitespace", all_commas, all_dots],
    )
    plain_text = clean.clean_text(plain_text, ["inline_whitespace", "all_whitespace"])
    # drop bare Roman-numeral section markers (runs of "I"), with any preceding star page number
    pattern = r"\*?\d*\s*I+\n"
    plain_text = re.sub(pattern, "", plain_text)
    # drop commas and periods preceded by whitespace (the whitespace is removed too)
    pattern = r"\s[,.]"
    plain_text = re.sub(pattern, "", plain_text)
    return plain_text

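# Quick sanity check (an illustrative sketch, not part of the original pipeline):
# run remove_citations over a made-up sentence containing a full case citation.
sample = (
    "The standing rule was settled in Lujan v. Defenders of Wildlife, "
    "504 U.S. 555, 560-61 (1992), and it remains controlling."
)
print(remove_citations(sample))
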
def split_text(text):
    # Split into chunks of up to 430 words using a 420-word stride,
    # so consecutive chunks overlap by 10 words.
    words = text.split()
    chunks = []
    for i in range(0, len(words), 420):
        chunks.append(" ".join(words[i : i + 430]))
    return chunks

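# Illustration (assumed example): with the 420-word stride and 430-word window above,
# a 1,000-word input yields chunks of 430, 430, and 160 words.
demo_chunks = split_text(" ".join(["word"] * 1000))
print([len(c.split()) for c in demo_chunks])  # [430, 430, 160]
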
# @title
def chunk_text_to_paragraphs(text):
    paragraphs = text.split("\n")  # Split on newlines
    # Remove leading and trailing whitespace from each paragraph
    paragraphs = [p.strip() for p in paragraphs]
    return paragraphs

# @title
def split_data(data, id2label, label2id):
    # id2label is accepted for symmetry with label2id but is not used here.
    data_dict = {
        "author_name": [],
        "label": [],
        "category": [],
        "case_name": [],
        "url": [],
        "text": [],
    }
    opinions_split = pd.DataFrame(data_dict)
    opinions_split["label"] = opinions_split["label"].astype(int)
    for index, row in data.iterrows():
        # chunks = chunk_text_to_paragraphs(row['text'])
        chunks = split_text(row["clean_text"])
        for chunk in chunks:
            # Skip chunks shorter than 1000 characters
            if len(chunk) < 1000:
                continue
            tmp = pd.DataFrame(
                {
                    "author_name": row["author_name"],
                    "label": [label2id[row["author_name"]]],
                    "category": row["category"],
                    "case_name": row["case_name"],
                    "url": [row["absolute_url"]],
                    "text": [chunk],
                }
            )
            opinions_split = pd.concat([opinions_split, tmp])
    return opinions_split

def chunk_data(data):
    data_dict = {"label": [], "text": []}
    opinions_split = pd.DataFrame(data_dict)
    chunks = split_text(data)
    for chunk in chunks:
        # if len(chunk) < 1000:
        #     continue
        # 200 serves as a placeholder label for text without a known author
        tmp = pd.DataFrame({"label": [200], "text": [chunk]})
        opinions_split = pd.concat([opinions_split, tmp])
    return opinions_split
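
# Illustrative wiring (hypothetical; `opinions` stands in for a DataFrame with the
# author_name, category, case_name, absolute_url, and clean_text columns used above).
# Label maps are built from the author names before chunking:
# authors = sorted(opinions["author_name"].unique())
# id2label = dict(enumerate(authors))
# label2id = {name: i for i, name in id2label.items()}
# opinions_split = split_data(opinions, id2label, label2id)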