Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pypandoc | |
| import glob | |
| import shutil | |
| import os | |
| import tqdm | |
| from huggingface_hub import snapshot_download | |
| from huggingface_hub import HfApi | |
| import tempfile | |
| import re | |
| from pdfminer.high_level import extract_text | |
| import time | |
# Hugging Face access token, read from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub client used to upload the generated result files.
api = HfApi()

# Unused python-docx output path, kept commented out:
# from docx import Document
# document = Document()
# document.add_heading('Labels for ', level=1)

# Local folder the per-category result files are written to.
RESULTS_FOLDER = "./results"

# Local path of the downloaded "claudiag/atlas" dataset snapshot. The
# download happens at import time.
DOC_FOLDER = snapshot_download(
    repo_id="claudiag/atlas",
    repo_type="dataset",
    token=HF_TOKEN,
)
# Default keywords ("codewords") for every category. Each list is joined
# into one alternation when matching, so a keyword only needs to appear
# once per category; the duplicated "diverse" and "bond" entries were
# removed.
CAT_TO_CODEWORDS = {
    "Prejudices": [
        "prejudice", "judge", "preconceive", "stigma", "assumption",
        "assume", "misunderstanding", "unexamined", "distorted", "clear",
        "compar",
    ],
    "Self-knowledge": [
        "self-knowledge", "self-awareness", "introspection", "examined",
        "myself", "realization", "belief",
    ],
    "Similarities": [
        "similarity", "same", "similar", "equal", "related", "together",
    ],
    "Diversity": [
        "diverse", "different", "particular", "range", "multiplicity",
    ],
    "Business school": [
        "ESADE", "competitive", "business school", "education", "study",
        "university", "student", "consulting", "professional", "pressure",
        "performance", "institution",
    ],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": [
        "change", "finally", "at last", "decided", "chose", "concluded",
        "want to", "swap", "different", "not the same", "replace",
        "convert", "trade", "future", "decision",
    ],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": [
        "speak", "express", "voice", "talk", "say", "open up", "articulate",
        "communicate", "convey", "reveal", "show", "verbalize", "phrase",
        "word",
    ],
    "Listening": [
        "listen", "pay attention", "quiet", "silence", "process", "hear",
        "attend",
    ],
    "Understanding": [
        "learn", "understand", "realize", "see", "believe", "question",
        "critical", "thought", "reasonable", "logical", "rational",
        "comprehensible", "accept",
    ],
    "Relationships": [
        "relationship", "relate", "bond", "connection", "others",
        "appreciate", "appreciation", "recognize", "recognition",
        "acknowledge",
    ],
    "Emotions": [
        "emotions", "felt", "feel", "a feeling of", "sense", "sensation",
        "instinct", "sentiment", "gut feeling", "intense", "wave",
    ],
    "The course": [
        "first time", "never", "always", "course", "elective",
        "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments",
    ],
}

# Category names, in the order their textboxes are created and their
# keyword strings are passed to `convert`.
CATEGORIES = CAT_TO_CODEWORDS.keys()
def retrieve_lines(filename):
    """Extract the text of a document and split it into "."-delimited chunks.

    PDF files are read with pdfminer's ``extract_text``. Word files
    (``docx``/``doc``) are converted to plain text with pandoc through a
    temporary file. The extracted lines are stripped, joined with single
    spaces and the result is split on ``"."``.

    Args:
        filename: Path of the document to read. The format is taken from
            the text after the last ``"."`` and compared case-insensitively.

    Returns:
        A list of strings, one per ``"."``-delimited chunk of the text.

    Raises:
        ValueError: If the extension is not ``pdf``, ``docx`` or ``doc``.
    """
    extension = filename.split(".")[-1].lower()

    if extension == "pdf":
        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, "plain", outputfile=outfile)
            # pandoc writes UTF-8, so do not depend on the locale's encoding.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported file type '{extension}' for {filename}; "
            "expected pdf, docx or doc."
        )

    text = " ".join(raw_line.strip() for raw_line in raw_lines)
    return text.split(".")
def match_code(lines, codewords):
    """Find the lines that contain any of the codewords and highlight them.

    Codewords are matched literally, case-sensitively and only at word
    boundaries. In every matching line each hit is upper-cased and runs
    of whitespace are collapsed to single spaces.

    Args:
        lines: Sequence of text chunks to search.
        codewords: Iterable of keywords. Surrounding whitespace is ignored
            and empty entries are skipped.

    Returns:
        A dict mapping the index of each matching line to its highlighted,
        whitespace-normalised text. Empty when nothing matches or when no
        usable codeword is given.
    """
    # Keywords come from splitting a text box on ",": "a, b," yields " b"
    # and "". An empty alternative would match at every word boundary and
    # flag every line, so such entries are dropped.
    stripped = (word.strip() for word in codewords)
    # Escape so that characters like "." or "+" are matched literally.
    alternatives = [re.escape(word) for word in stripped if word]
    if not alternatives:
        return {}

    pattern = r"\b(?:" + "|".join(alternatives) + r")\b"
    keywords_to_match = re.compile(pattern)

    match_dict = {}
    for i, line in enumerate(lines):
        # Substituting avoids slicing by spans of the original line, which
        # breaks once upper-casing changes a match's length ("ß" -> "SS").
        highlighted, num_hits = keywords_to_match.subn(
            lambda m: m.group(0).upper(), line
        )
        if num_hits:
            match_dict[i] = " ".join(highlighted.split())
    return match_dict
def main(filename, codewords_mapping):
    """Append the keyword matches of one document to the category files.

    The document is read with ``retrieve_lines``. For every category that
    has at least one match (see ``match_code``), a section listing the
    highlighted lines is appended to
    ``<RESULTS_FOLDER>/<label with whitespace replaced by "_">.result.txt``.
    A file header is written first when that file does not exist yet.

    Args:
        filename: Path of the document to scan.
        codewords_mapping: Dict mapping a category label to its keywords.

    Returns:
        The list of result-file paths that were written to for this
        document, in category order.
    """
    lines = retrieve_lines(filename)
    # Identify the source by its parent folder and file name only.
    source = "/".join(filename.split("/")[-2:])

    files = []
    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_name = "_".join(label.split()) + ".result.txt"
        result_file = os.path.join(RESULTS_FOLDER, result_name)

        parts = []
        if not os.path.exists(result_file):
            # First match for this category: start the file with a header.
            parts.append(f"# Code: {label}\n" + 25 * "=" + "\n\n")
        parts.append(f"## Source: {source}\n" + 25 * "-" + "\n")
        parts.append("\n".join(f"-{text}" for text in match.values()))
        parts.append("\n" + 25 * "-" + "\n\n")

        # Extracted text often contains non-ASCII characters; write UTF-8
        # explicitly so the outcome does not depend on the locale.
        with open(result_file, "a", encoding="utf-8") as f:
            f.write("".join(parts))
        files.append(result_file)
    return files
def convert(*keywords):
    """Scan every downloaded document and upload the per-category results.

    The results folder is recreated from scratch. Every file one level
    below each entry of ``DOC_FOLDER`` is then passed to ``main``, and the
    results folder is finally uploaded to the "patrickvonplaten/atlas"
    dataset under a timestamped ``results_<time>`` path.

    Args:
        *keywords: One comma-separated keyword string per category, in the
            order of ``CATEGORIES``.

    Returns:
        A status message with the number of files found. Files that
        failed to process are included in that count.
    """
    codewords_mapping = {
        category: text.split(",") for category, text in zip(CATEGORIES, keywords)
    }

    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    # rmtree ignores errors, so the folder may still exist; do not fail then.
    os.makedirs(RESULTS_FOLDER, exist_ok=True)

    num_files = 0
    for folder in tqdm.tqdm(glob.glob(os.path.join(DOC_FOLDER, "*"))):
        filenames = glob.glob(f"{folder}/*")
        num_files += len(filenames)
        for filename in tqdm.tqdm(filenames):
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                # Best effort: report the unreadable document and keep going.
                print(f"{filename} not working because \n {e}")

    api.upload_folder(
        repo_id="patrickvonplaten/atlas",
        folder_path=RESULTS_FOLDER,
        path_in_repo=f"results_{time.time()}",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return f"Done. Processed {num_files} files."
# One textbox per category, pre-filled with the default keywords as a
# comma-separated string. `placeholder` is given that same string because
# it is documented as a string, not a list.
inputs = []
for category in CATEGORIES:
    default_keywords = ",".join(CAT_TO_CODEWORDS[category])
    inputs.append(
        gr.Textbox(
            label=f"Enter your keywords for {category}",
            max_lines=2,
            placeholder=default_keywords,
            value=default_keywords,
        )
    )

# The textbox values are passed positionally to `convert`; its status
# message is shown as plain text.
iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()