from nltk.tokenize import word_tokenize from nltk.stem import WordNetLemmatizer from nltk.corpus import stopwords from nltk.tag import pos_tag from nltk.probability import FreqDist import requests from dotenv import load_dotenv import os import random load_dotenv() stop_words = set(stopwords.words('english')) def preprocess_text(text): wnl = WordNetLemmatizer() tokens = word_tokenize(text.lower()) filtered_tokens = [word for (word,pos) in pos_tag(tokens) if word not in stop_words and len(word) > 2 and pos[:2] == 'NN'] lemmatized_tokens = [wnl.lemmatize(word) for word in filtered_tokens] return lemmatized_tokens def extract_keywords_nltk(text:str,num_keywords_text:int=10): words = preprocess_text(text) # Frequency distribution of words freq_dist = FreqDist(words) # Get the 10 most common words most_common_words = freq_dist.most_common(num_keywords_text) keywords = [word for word, _ in most_common_words] return keywords def request_image_analysis(api_url, api_client, api_key, image_url, num_keyword=10): #print("Image URL", image_url) params={'url':image_url,'num_keywords':num_keyword} try: response = requests.get(api_url, params=params, auth=(api_client, api_key)) response.raise_for_status() data = response.json() #print(data) return data except requests.exceptions.RequestException as e: return({"error":"Failed Image keywording call","message":e}) def parseImgLis(keywordlist:list): res = [] for entry in keywordlist: res.append(entry["keyword"]) return res def analyseImages(images:list,req:dict): api_url = os.environ.get('AIGENENDPOINT') api_key = os.environ.get('AIGENKEY') api_client = os.environ.get('AIGENCLIENT') selectedImages = random.sample(images,req["num_images"]) #print(selectedImages,api_url) keywords = {} for i in range(0, len(selectedImages)): res= request_image_analysis(api_url,api_client,api_key,selectedImages[i]) if("error" in res): return res else: keywords[str(i)]= parseImgLis(res['keywords']) return keywords def 
# NOTE(review): this single mangled line contains (a) the body of
# generateKeywords — whose `def ` keyword sits at the end of the previous
# line — and (b) the tail end of a SECOND function, most likely
# `analyseSiteData(siteData, req)`, which builds the `res` dict of
# keywords_images/keywords_text and then recursively-looking calls
# generateKeywords(res, req, ...).
# The source appears HTML-tag-stripped: everything between a literal '<' and
# the following '>' has been deleted. That is why the condition reads
# `len(siteData["keywords_text"])0)` (the comparison operator and the entire
# span up to `len(siteData["images"])>` in the second function are gone), why
# `res` is first a list (`res = []`) and later used as a dict
# (`res["keywords_images"]=...`, `if(res=={})`), and why generateKeywords has
# no visible return and the second function has no visible `def` header.
# The missing middle — the end of generateKeywords (presumably the Pexels
# query-building logic hinted at by the comments about imagesPerSearch and
# querying more pages) and the start of analyseSiteData — cannot be
# reconstructed from this file; recover it from version control before
# editing. Code below is kept byte-identical.
generateKeywords(siteData,req,imagesPerSearch:int=8,subsetseed:int=42, categories:list=["text_only","combined_images"]): #For now keep it simple. Two seperate searches for images and res = [] queryCount = 0 imageKeywords = [] random.seed(subsetseed) #to avoid images being repetitive, instead querying more pages from the Pexels API #Make each image a category if('keywords_images' in siteData and "combined_images" in categories): for i in range(0,len(siteData["keywords_images"])): if(str(i) in siteData["keywords_images"]): imageKeywords+=siteData["keywords_images"][str(i)] #Handling missing keywords if("text_only" in categories and "keywords_text" not in siteData or len(siteData["keywords_text"])0): imgkeywords = analyseImages(siteData["images"],req) res["keywords_images"]=imgkeywords if(req["use_text"] and siteData["text"] and len(siteData["text"])>0): siteText = " ".join(siteData["text"]) keywords= extract_keywords_nltk(siteText,req["num_keywords_text"]) res["keywords_text"]=keywords if(res=={}): return {"error":"analysis.py, problem encountered when analysing site data"} else: return generateKeywords(res,req,categories=["text_only","combined_images"]) #try: #except: # return {"error":"analysis.py, problem encountered while selecting keywords"} #return res