"""FastAPI service that predicts tags for questions written in Markdown.

Each question body is split into prose and code. Both parts are normalised
and scored by their own spaCy pipeline, and every tag whose score exceeds
``TAG_PROBABILITY_THRESHOLD`` in either pipeline is returned.
"""
import json
from typing import List

import spacy
from bs4 import BeautifulSoup
from fastapi import FastAPI
from markdown import markdown

TAG_PROBABILITY_THRESHOLD = 0.5

# Upper bound on the number of tokens kept per document by ``preprocess``.
MAX_TOKENS_PER_DOCUMENT = 512

# Coarse part-of-speech labels that ``preprocess`` drops.
_REMOVED_POS = frozenset({"PUNCT", "SPACE", "NUM", "SYM"})

# Pipeline components that ``preprocess`` switches off while tokenising.
_PREPROCESS_DISABLED_PIPES = [
    "transformer",
    "tagger",
    "parser",
    "attribute_ruler",
    "lemmatizer",
    "ner",
]

app = FastAPI()

nlp_text = spacy.load("./spacy_text_model")
nlp_code = spacy.load("./spacy_code_model")

# Explicit encoding so the tag list loads identically on every platform.
with open("selected_tags.json", "r", encoding="utf-8") as openfile:
    selected_tags = json.load(openfile)


def preprocess(texts):
    """Normalise raw strings into space-joined, lower-cased alphabetic tokens.

    Args:
        texts: Iterable of strings to clean.

    Returns:
        A list with one cleaned string per input, in input order. Each
        string holds at most ``MAX_TOKENS_PER_DOCUMENT`` tokens and is empty
        when no token survives the filter.
    """
    cleaned_texts = []
    # NOTE(review): ``nlp_text`` is used for every input, including code
    # snippets -- confirm this matches how the code model was trained.
    for doc in nlp_text.pipe(texts, disable=_PREPROCESS_DISABLED_PIPES):
        kept = []
        for token in doc:
            if len(kept) >= MAX_TOKENS_PER_DOCUMENT:
                # Cap reached: nothing further can be appended, so stop
                # scanning the remaining tokens.
                break
            # NOTE(review): with the tagging components disabled ``pos_`` is
            # presumably empty, leaving ``is_alpha`` as the effective filter
            # -- confirm against the loaded pipeline.
            if token.pos_ not in _REMOVED_POS and token.is_alpha:
                kept.append(token.lower_)
        cleaned_texts.append(" ".join(kept))
    return cleaned_texts


def get_text_and_code(body):
    """Split a Markdown document into its prose and its code.

    Args:
        body: Markdown source of one question.

    Returns:
        A ``(text, code)`` tuple. ``code`` is the newline-joined content of
        every ``<code>`` element in the rendered HTML; ``text`` is the
        newline-joined remaining text nodes once those elements are removed.
    """
    html = markdown(body)
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results can differ between environments.
    soup = BeautifulSoup(html, "html.parser")
    code_elements = soup.find_all("code")
    code = "\n".join([element.text for element in code_elements])
    # Remove the code elements so their content is not counted as prose.
    for element in code_elements:
        element.decompose()
    text = "\n".join(soup.find_all(string=True))
    return text, code


def _predict_cats(nlp, texts):
    """Return one ``{tag: score}`` mapping per text using pipeline ``nlp``.

    Documents whose text is empty get a score of 0 for every selected tag
    instead of whatever the pipeline produced for them.
    """
    predictions = []
    for doc in nlp.pipe(texts):
        if doc.text != "":
            predictions.append(doc.cats)
        else:
            predictions.append(dict.fromkeys(selected_tags, 0))
    return predictions


def _tags_above_threshold(text_cats, code_cats):
    """Return tags scoring above the threshold in either mapping.

    Tags are emitted once each, in ``selected_tags`` order, so the result is
    deterministic from one process to the next.
    """
    hits = [
        tag
        for tag in selected_tags
        if text_cats[tag] > TAG_PROBABILITY_THRESHOLD
        or code_cats[tag] > TAG_PROBABILITY_THRESHOLD
    ]
    # Drop repeats while keeping first-seen order.
    return list(dict.fromkeys(hits))


@app.post("/infer_tags")
async def infer_tags(questions: List[str]):
    """Predict tags for a batch of Markdown-formatted questions.

    Args:
        questions: Markdown bodies of the questions to tag.

    Returns:
        A list parallel to ``questions``. Each entry is the list of selected
        tags whose score exceeds ``TAG_PROBABILITY_THRESHOLD`` in the text
        model, the code model, or both.
    """
    texts = []
    codes = []
    for question in questions:
        text, code = get_text_and_code(question)
        texts.append(text)
        codes.append(code)

    pred_text = _predict_cats(nlp_text, preprocess(texts))
    pred_code = _predict_cats(nlp_code, preprocess(codes))

    return [
        _tags_above_threshold(text_cats, code_cats)
        for text_cats, code_cats in zip(pred_text, pred_code)
    ]