Update main2.py
main2.py
CHANGED
@@ -1,28 +1,102 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan  4 05:56:28 2023
+
+@author: dreji18
+"""
+
+# loading the packages
+from rake_nltk import Rake
+import wikipedia
+from rank_bm25 import BM25Okapi
+import torch
+from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
 from fastapi import FastAPI
-from transformers import pipeline
-
-# Create a new FastAPI app instance
 app = FastAPI()
-
-
-
-
-
-
-
-# Define a function to handle the GET request at `/generate`
-# The generate() function is defined as a FastAPI route that takes a
-# string parameter called text. The function generates text based on the input using the pipeline() object, and returns a JSON response
-# containing the generated text under the key "output"
-@app.get("/generate")
-def generate(text: str):
+
+@app.get("/")
+def read_root():
+    return {"Hello": "World"}
+
+# keyword extraction function
+def keyword_extractor(query):
     """
-
-
-
+    Rake has some useful features:
+    1. converts text to lower case automatically
+    2. extracts important key phrases
+    3. extracts combined phrases as well (e.g. Deep Learning, Capital City)
     """
-
-
-
-
-
+    r = Rake()  # uses English stopwords from NLTK and all punctuation characters
+    r.extract_keywords_from_text(query)
+    keywords = r.get_ranked_phrases()  # keyword phrases ranked highest to lowest
+    return keywords
+
+# data collection using wikipedia
+def data_collection(search_words):
+    """wikipedia"""
+    search_query = ' '.join(search_words)
+    wiki_pages = wikipedia.search(search_query, results=5)
+
+    information_list = []
+    pages_list = []
+    for i in wiki_pages:
+        try:
+            info = wikipedia.summary(i)
+            if any(word in info.lower() for word in search_words):
+                information_list.append(info)
+                pages_list.append(i)
+        except Exception:
+            pass
+
+    original_info = information_list
+    information_list = [item[:1000] for item in information_list]  # truncate each summary to 1000 characters to keep it within the 512-token model limit
+
+    return information_list, pages_list, original_info
+
+# document ranking function
+def document_ranking(documents, query, n):
+    """BM25"""
+    try:
+        tokenized_corpus = [doc.split(" ") for doc in documents]
+        bm25 = BM25Okapi(tokenized_corpus)
+        tokenized_query = query.split(" ")
+        doc_scores = bm25.get_scores(tokenized_query)
+        datastore = bm25.get_top_n(tokenized_query, documents, n)
+    except Exception:
+        datastore = []  # fall back to no documents instead of leaving datastore unbound
+    return datastore
+
+def qna(context, question):
+    """DistilBert"""
+    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=True)
+    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', return_dict=False)
+    encoding = tokenizer.encode_plus(question, context)
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+    start_scores, end_scores = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
+    ans_tokens = input_ids[torch.argmax(start_scores): torch.argmax(end_scores) + 1]
+    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
+    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
+
+    return answer_tokens_to_string
+
+@app.get("/predict")
+def answergen(search_string: str):
+    try:
+        keyword_list = keyword_extractor(search_string)
+        information, pages, original_data = data_collection(keyword_list)
+        datastore = document_ranking(information, search_string, 3)
+
+        answers_list = []
+        for i in range(len(datastore)):
+            result = qna(datastore[i], search_string)
+            answers_list.append(result)
+
+        return {"answer 1": answers_list[0],
+                "answer 2": answers_list[1],
+                "answer 3": answers_list[2]}
+    except Exception:
+        return {"error": "sorry, couldn't process the request"}
+
+#uvicorn main2:app --port 8000 --reload
+
+#%%
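For quick verification of the new routes, a minimal client-side sketch follows. It is illustrative and not part of the commit: it assumes the app is started locally with `uvicorn main2:app --port 8000 --reload` (per the comment at the bottom of the file), and the base URL, the `requests` dependency, and the sample question are all assumptions.

# Smoke test for the updated endpoints (illustrative, not in the commit).
# Assumes the server was started with: uvicorn main2:app --port 8000 --reload
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local address; adjust for a deployed Space

# New root route added in this commit
print(requests.get(f"{BASE_URL}/").json())  # expected: {"Hello": "World"}

# /predict chains RAKE keywords -> Wikipedia search -> BM25 ranking -> DistilBERT QA
resp = requests.get(f"{BASE_URL}/predict", params={"search_string": "capital city of France"})
print(resp.json())  # expected keys: "answer 1", "answer 2", "answer 3" (or "error" on failure)

Note that `qna()` loads the DistilBERT tokenizer and model on every request, so the first call also downloads the weights and can take a while; later calls reuse the local Hugging Face cache.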