# sneakpic/api/analysis.py
# Last commit: efraimdahl — "rewrite history because HF can't actually delete binary files" (a3b1677)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.probability import FreqDist
import requests
from dotenv import load_dotenv
import os
import random
load_dotenv()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Tokenize *text*, keep noun-tagged non-stopword tokens, and lemmatize them.

    Returns a list of lemmatized, lowercased noun tokens longer than two
    characters (POS tags starting with "NN": NN, NNS, NNP, NNPS).
    """
    lemmatizer = WordNetLemmatizer()
    tagged = pos_tag(word_tokenize(text.lower()))
    nouns = [
        token
        for token, tag in tagged
        if token not in stop_words and len(token) > 2 and tag.startswith("NN")
    ]
    return [lemmatizer.lemmatize(token) for token in nouns]
def extract_keywords_nltk(text: str, num_keywords_text: int = 10):
    """Return the *num_keywords_text* most frequent noun lemmas found in *text*."""
    distribution = FreqDist(preprocess_text(text))
    return [word for word, _count in distribution.most_common(num_keywords_text)]
def request_image_analysis(api_url, api_client, api_key, image_url, num_keyword=10, timeout=30):
    """Call the external image-keywording API for a single image.

    Parameters:
        api_url: endpoint URL of the keywording service.
        api_client, api_key: HTTP basic-auth credentials.
        image_url: URL of the image to analyse.
        num_keyword: number of keywords requested from the service.
        timeout: request timeout in seconds (new, defaulted — prevents the
            previous behavior of blocking forever on an unresponsive service).

    Returns the decoded JSON response on success, otherwise a dict of the form
    {"error": ..., "message": ...}.
    """
    params = {'url': image_url, 'num_keywords': num_keyword}
    try:
        response = requests.get(api_url, params=params, auth=(api_client, api_key), timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # str(e) keeps the error payload JSON-serializable; the raw exception
        # object previously stored here is not.
        return {"error": "Failed Image keywording call", "message": str(e)}
def parseImgLis(keywordlist: list):
    """Extract the "keyword" field from each entry of an API keyword list.

    Each entry is expected to be a dict with at least a "keyword" key
    (as returned by the image-keywording service).
    """
    return [entry["keyword"] for entry in keywordlist]
def analyseImages(images: list, req: dict):
    """Keyword a random subset of *images* via the external analysis API.

    Picks req["num_images"] images at random (clamped to the number available,
    so random.sample can no longer raise ValueError on short lists) and keywords
    each through the endpoint configured in the AIGEN* environment variables.

    Returns {"0": [...], "1": [...], ...} keyword lists keyed by sample index,
    or the first {"error": ...} dict returned by the API.
    """
    api_url = os.environ.get('AIGENENDPOINT')
    api_key = os.environ.get('AIGENKEY')
    api_client = os.environ.get('AIGENCLIENT')
    # Clamp the sample size: random.sample raises if asked for more items
    # than the population holds.
    sample_size = min(req["num_images"], len(images))
    selected_images = random.sample(images, sample_size)
    keywords = {}
    for i, image_url in enumerate(selected_images):
        res = request_image_analysis(api_url, api_client, api_key, image_url)
        if "error" in res:
            # Propagate the first failure unchanged to the caller.
            return res
        keywords[str(i)] = parseImgLis(res['keywords'])
    return keywords
def generateKeywords(siteData, req, imagesPerSearch: int = 8, subsetseed: int = 42,
                     categories: list = None):
    """Build image-search queries from previously extracted keywords.

    Parameters:
        siteData: dict optionally holding "keywords_text" (list of words) and
            "keywords_images" (dict keyed by stringified index -> word list).
        req: dict with "num_query_keywords", "result_images" and "page".
        imagesPerSearch: images requested per individual query.
        subsetseed: seed re-applied before every sample for determinism.
        categories: query categories to use; defaults to
            ["text_only", "combined_images"]. (Previously a mutable default
            that was mutated via .remove(), corrupting later calls.)

    Returns {"queries": [...]} or {"error": ...} when no category has enough
    keywords to sample from.
    """
    if categories is None:
        categories = ["text_only", "combined_images"]
    else:
        # Work on a copy: unusable categories are removed below and the
        # caller's list must not be mutated.
        categories = list(categories)
    res = []
    queryCount = 0
    imageKeywords = []
    random.seed(subsetseed)  # deterministic sampling per seed
    # Flatten the per-image keyword lists into one shared pool.
    if 'keywords_images' in siteData and "combined_images" in categories:
        for i in range(len(siteData["keywords_images"])):
            imageKeywords += siteData["keywords_images"].get(str(i), [])
    # Drop categories lacking enough keywords to sample from.
    # Fix: the original `A and B or C` parsed as `(A and B) or C`, raising
    # KeyError/ValueError whenever "keywords_text" was absent while
    # "text_only" was not among the categories.
    if "text_only" in categories and len(siteData.get("keywords_text", [])) < req["num_query_keywords"]:
        categories.remove("text_only")
    if "combined_images" in categories and len(imageKeywords) < req["num_query_keywords"]:
        categories.remove("combined_images")
    if len(categories) == 0:
        return {"error": "Not enough keywords to choose from"}
    queriesPerPage = req["result_images"] / imagesPerSearch
    currentCategory = int((req["page"] * queriesPerPage) % len(categories))
    while queryCount < queriesPerPage:
        catname = categories[currentCategory]
        # Vary the page per query so repeated calls don't fetch identical results.
        uniquePage = int(queryCount / len(categories) * (req["page"] + 1))
        if catname == "text_only":
            random.seed(subsetseed)
            res.append({"keywords": random.sample(siteData['keywords_text'], req["num_query_keywords"]),
                        "num_images": imagesPerSearch, "page": uniquePage, "id": catname})
        elif catname == "combined_images":
            random.seed(subsetseed)
            res.append({"keywords": random.sample(imageKeywords, req["num_query_keywords"]),
                        "num_images": imagesPerSearch, "page": uniquePage, "id": catname})
        queryCount += 1
        currentCategory = (currentCategory + 1) % len(categories)
    return {"queries": res}
def analyseSite(siteData, req):
    """Extract keywords from scraped site data and turn them into search queries.

    Runs image analysis when req["use_images"] is set and site images exist,
    and text keyword extraction when req["use_text"] is set and site text
    exists, then delegates to generateKeywords.

    Returns the {"queries": ...} dict from generateKeywords, or an
    {"error": ...} dict when nothing could be analysed or the image API failed.
    """
    res = {}
    # .get() avoids a KeyError when the scraper produced no "images"/"text" key;
    # an empty or missing list is simply skipped, as before.
    if req["use_images"] and siteData.get("images"):
        imgkeywords = analyseImages(siteData["images"], req)
        # Surface API failures immediately instead of burying the error dict
        # inside the keyword results.
        if isinstance(imgkeywords, dict) and "error" in imgkeywords:
            return imgkeywords
        res["keywords_images"] = imgkeywords
    if req["use_text"] and siteData.get("text"):
        siteText = " ".join(siteData["text"])
        res["keywords_text"] = extract_keywords_nltk(siteText, req["num_keywords_text"])
    if not res:
        return {"error": "analysis.py, problem encountered when analysing site data"}
    return generateKeywords(res, req, categories=["text_only", "combined_images"])