Spaces:
Sleeping
Sleeping
File size: 5,278 Bytes
a3b1677 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.probability import FreqDist
import requests
from dotenv import load_dotenv
import os
import random
# Load API credentials (AIGENENDPOINT / AIGENKEY / AIGENCLIENT) from a local
# .env file into os.environ; read later by analyseImages.
load_dotenv()
# English stopword set used by preprocess_text; requires the NLTK
# "stopwords" corpus to be downloaded beforehand.
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Tokenize *text* and return lemmatized noun tokens.

    Keeps only tokens that are not stopwords, are longer than two
    characters, and are POS-tagged as nouns (tag starting with 'NN').
    """
    lemmatizer = WordNetLemmatizer()
    tagged = pos_tag(word_tokenize(text.lower()))
    nouns = []
    for token, tag in tagged:
        if token in stop_words:
            continue
        if len(token) <= 2 or not tag.startswith('NN'):
            continue
        nouns.append(lemmatizer.lemmatize(token))
    return nouns
def extract_keywords_nltk(text: str, num_keywords_text: int = 10):
    """Return up to *num_keywords_text* of the most frequent noun keywords in *text*."""
    # Rank preprocessed noun tokens by frequency and keep the top ones.
    freq = FreqDist(preprocess_text(text))
    return [token for token, _count in freq.most_common(num_keywords_text)]
def request_image_analysis(api_url, api_client, api_key, image_url, num_keyword=10):
    """Request keyword analysis for one image from the external keywording API.

    Args:
        api_url: endpoint of the keywording service.
        api_client / api_key: HTTP basic-auth credentials.
        image_url: URL of the image to analyse.
        num_keyword: maximum number of keywords to request.

    Returns:
        The decoded JSON payload on success, or a plain-data dict with an
        "error" key and a human-readable "message" on failure.
    """
    params = {'url': image_url, 'num_keywords': num_keyword}
    try:
        # timeout so a stalled service cannot hang the caller indefinitely
        # (requests.get has no default timeout).
        response = requests.get(api_url, params=params,
                                auth=(api_client, api_key), timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # str(e): the raw exception object is not JSON-serializable, and
        # callers treat this dict as plain data.
        return {"error": "Failed Image keywording call", "message": str(e)}
def parseImgLis(keywordlist: list):
    """Collect the "keyword" field of every entry in an API keyword list."""
    return [entry["keyword"] for entry in keywordlist]
def analyseImages(images: list, req: dict):
    """Keyword a random sample of *images* via the external analysis API.

    Reads the API endpoint and credentials from the environment
    (AIGENENDPOINT / AIGENKEY / AIGENCLIENT, loaded from .env).

    Returns:
        A dict mapping sample index (as a string) to its keyword list,
        e.g. {"0": [...], "1": [...]}, or the error dict from the first
        failed API call.
    """
    api_url = os.environ.get('AIGENENDPOINT')
    api_key = os.environ.get('AIGENKEY')
    api_client = os.environ.get('AIGENCLIENT')
    # Clamp the sample size: random.sample raises ValueError when asked
    # for more items than the population contains.
    sample_size = min(req["num_images"], len(images))
    selected_images = random.sample(images, sample_size)
    keywords = {}
    for i, image_url in enumerate(selected_images):
        res = request_image_analysis(api_url, api_client, api_key, image_url)
        if "error" in res:
            # Propagate the first failure unchanged to the caller.
            return res
        keywords[str(i)] = parseImgLis(res['keywords'])
    return keywords
def generateKeywords(siteData, req, imagesPerSearch: int = 8, subsetseed: int = 42, categories: list = ["text_only", "combined_images"]):
    """Build a list of image-search queries from text and image keywords.

    For now this keeps it simple: separate query categories for text-derived
    keywords ("text_only") and image-derived keywords ("combined_images").

    Args:
        siteData: dict with optional "keywords_text" (list of keywords) and
            "keywords_images" (dict of str index -> keyword list).
        req: dict with "num_query_keywords", "result_images" and "page".
        imagesPerSearch: images requested per generated query.
        subsetseed: RNG seed so repeated calls sample the same keywords
            (pagination then queries new pages instead of repeating images).
        categories: query categories to generate; unsupported ones are
            dropped when their keyword pool is too small.

    Returns:
        {"queries": [...]} on success, or {"error": ...} when no category
        has enough keywords to sample from.
    """
    # Work on a copy: the original mutated the caller's list -- and the
    # shared mutable default argument -- via .remove() below.
    categories = list(categories)
    res = []
    queryCount = 0
    imageKeywords = []
    random.seed(subsetseed)
    # Flatten the per-image keyword lists into one candidate pool.
    if 'keywords_images' in siteData and "combined_images" in categories:
        for i in range(len(siteData["keywords_images"])):
            if str(i) in siteData["keywords_images"]:
                imageKeywords += siteData["keywords_images"][str(i)]
    # Drop categories whose keyword pool is too small to sample from.
    # NOTE: parenthesized deliberately -- the original `A and B or C`
    # evaluated len(siteData["keywords_text"]) (a KeyError when the key is
    # missing) whenever "text_only" was not in categories.
    if "text_only" in categories and ("keywords_text" not in siteData
                                      or len(siteData["keywords_text"]) < req["num_query_keywords"]):
        categories.remove("text_only")
    if "combined_images" in categories and len(imageKeywords) < req["num_query_keywords"]:
        categories.remove("combined_images")
    if not categories:
        print(imageKeywords, siteData)
        return {"error": "Not enough keywords to choose from"}
    queriesPerPage = req["result_images"] / imagesPerSearch
    # Rotate the starting category with the requested page.
    currentCategory = int((req["page"] * queriesPerPage) % len(categories))
    while queryCount < queriesPerPage:
        catname = categories[currentCategory]
        # Derive a page number unique to this (query, page) combination.
        uniquePage = int(queryCount / len(categories) * (req["page"] + 1))
        if catname == "text_only":
            random.seed(subsetseed)  # re-seed so the sampled keywords are stable
            res.append({"keywords": random.sample(siteData['keywords_text'], req["num_query_keywords"]),
                        "num_images": imagesPerSearch, "page": uniquePage, "id": catname})
        elif catname == "combined_images":
            random.seed(subsetseed)
            res.append({"keywords": random.sample(imageKeywords, req["num_query_keywords"]),
                        "num_images": imagesPerSearch, "page": uniquePage, "id": catname})
        queryCount += 1
        currentCategory = (currentCategory + 1) % len(categories)
    return {"queries": res}
def analyseSite(siteData, req):
    """Run the requested analyses on scraped site data and build search queries.

    Args:
        siteData: dict with optional "images" (list of URLs) and "text"
            (list of strings) from the scraper.
        req: request dict with "use_images", "use_text",
            "num_keywords_text" and the keys generateKeywords needs.

    Returns:
        generateKeywords' result on success, or an {"error": ...} dict when
        nothing could be analysed or image keywording failed.
    """
    res = {}
    # .get() so a siteData missing "images"/"text" skips that analysis
    # instead of raising KeyError; empty lists are falsy, so the explicit
    # len(...) > 0 check is folded into the truthiness test.
    if req["use_images"] and siteData.get("images"):
        imgkeywords = analyseImages(siteData["images"], req)
        if "error" in imgkeywords:
            # Propagate API failures instead of burying them in res, where
            # generateKeywords would silently find no usable keywords.
            return imgkeywords
        res["keywords_images"] = imgkeywords
    if req["use_text"] and siteData.get("text"):
        siteText = " ".join(siteData["text"])
        res["keywords_text"] = extract_keywords_nltk(siteText, req["num_keywords_text"])
    if not res:
        return {"error": "analysis.py, problem encountered when analysing site data"}
    return generateKeywords(res, req, categories=["text_only", "combined_images"])