from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.probability import FreqDist
import requests
from dotenv import load_dotenv
import os
import random

load_dotenv()

stop_words = set(stopwords.words('english'))
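# The NLTK calls below assume the required corpora and models have been
# downloaded once (one-off setup, not part of the request path), e.g.:
#   import nltk
#   nltk.download('punkt')
#   nltk.download('stopwords')
#   nltk.download('wordnet')
#   nltk.download('averaged_perceptron_tagger')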
def preprocess_text(text):
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    # Keep only nouns (POS tags starting with 'NN') that are longer than
    # two characters and not stopwords
    filtered_tokens = [word for (word, pos) in pos_tag(tokens)
                       if word not in stop_words and len(word) > 2 and pos[:2] == 'NN']
    lemmatized_tokens = [wnl.lemmatize(word) for word in filtered_tokens]
    return lemmatized_tokens
def extract_keywords_nltk(text: str, num_keywords_text: int = 10):
    words = preprocess_text(text)
    # Frequency distribution of words
    freq_dist = FreqDist(words)
    # Get the num_keywords_text most common words
    most_common_words = freq_dist.most_common(num_keywords_text)
    keywords = [word for word, _ in most_common_words]
    return keywords
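# Usage sketch (illustrative input and output, not from the original source):
#   extract_keywords_nltk("The bakery sells fresh bread and cakes every day.", 3)
#   -> something like ['bakery', 'bread', 'cake'] after noun filtering and lemmatization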
def request_image_analysis(api_url, api_client, api_key, image_url, num_keyword=10):
    #print("Image URL", image_url)
    params = {'url': image_url, 'num_keywords': num_keyword}
    try:
        response = requests.get(api_url, params=params, auth=(api_client, api_key))
        response.raise_for_status()
        data = response.json()
        #print(data)
        return data
    except requests.exceptions.RequestException as e:
        # str(e) keeps the error payload JSON-serializable
        return {"error": "Image keywording call failed", "message": str(e)}
def parseImgLis(keywordlist: list):
    res = []
    for entry in keywordlist:
        res.append(entry["keyword"])
    return res
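# parseImgLis assumes the keywording API responds with JSON shaped roughly like
#   {"keywords": [{"keyword": "beach", ...}, {"keyword": "sunset", ...}]}
# and flattens the entries into a plain list of keyword strings.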
def analyseImages(images: list, req: dict):
    api_url = os.environ.get('AIGENENDPOINT')
    api_key = os.environ.get('AIGENKEY')
    api_client = os.environ.get('AIGENCLIENT')
    # random.sample raises ValueError if asked for more items than exist,
    # so cap the sample size at the number of available images
    selectedImages = random.sample(images, min(req["num_images"], len(images)))
    #print(selectedImages, api_url)
    keywords = {}
    for i in range(0, len(selectedImages)):
        res = request_image_analysis(api_url, api_client, api_key, selectedImages[i])
        if "error" in res:
            return res
        else:
            keywords[str(i)] = parseImgLis(res['keywords'])
    return keywords
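# analyseImages reads its API credentials from the environment (populated by
# load_dotenv() above); the expected .env entries, by variable name, are:
#   AIGENENDPOINT=<keywording API URL>
#   AIGENKEY=<API key>
#   AIGENCLIENT=<API client id>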
def generateKeywords(siteData, req, imagesPerSearch: int = 8, subsetseed: int = 42, categories: list = ["text_only", "combined_images"]):
    #For now keep it simple: two separate searches, one for text and one for images
    res = []
    queryCount = 0
    imageKeywords = []
    # Copy so that removing categories below does not mutate the caller's list
    # (or the shared mutable default argument)
    categories = list(categories)
    random.seed(subsetseed)  # to avoid repetitive images, query more pages from the Pexels API instead of resampling
    #Make each image a category
    if 'keywords_images' in siteData and "combined_images" in categories:
        for i in range(0, len(siteData["keywords_images"])):
            if str(i) in siteData["keywords_images"]:
                imageKeywords += siteData["keywords_images"][str(i)]
    #Handling missing keywords; the parentheses matter here, otherwise the len()
    #check runs even when "keywords_text" is absent and raises a KeyError
    if "text_only" in categories and ("keywords_text" not in siteData or len(siteData["keywords_text"]) < req["num_query_keywords"]):
        #print("removing category text_only, due to missing text keywords")
        categories.remove("text_only")
    if "combined_images" in categories and len(imageKeywords) < req["num_query_keywords"]:
        #print("removing category combined_images, due to missing image keywords")
        categories.remove("combined_images")
    if len(categories) == 0:
        print(imageKeywords, siteData)
        return {"error": "Not enough keywords to choose from"}
    queriesPerPage = req["result_images"] / imagesPerSearch
    currentCategory = int((req["page"] * queriesPerPage) % len(categories))
    #print(queryCount, queriesPerPage)
    while queryCount < queriesPerPage:
        catname = categories[currentCategory]
        uniquePage = int(queryCount / len(categories) * (req["page"] + 1))
        if catname == "text_only":
            random.seed(subsetseed)
            query = {"keywords": random.sample(siteData['keywords_text'], req["num_query_keywords"]), "num_images": imagesPerSearch, "page": uniquePage, "id": catname}
            res.append(query)
        elif catname == "combined_images":
            random.seed(subsetseed)
            query = {"keywords": random.sample(imageKeywords, req["num_query_keywords"]), "num_images": imagesPerSearch, "page": uniquePage, "id": catname}
            res.append(query)
        #print(query)
        queryCount += 1
        currentCategory += 1
        currentCategory = currentCategory % len(categories)
    return {"queries": res}
def analyseSite(siteData, req):
    res = {}
    # .get() avoids a KeyError when the scraped siteData lacks a field entirely
    if req["use_images"] and siteData.get("images") and len(siteData["images"]) > 0:
        imgkeywords = analyseImages(siteData["images"], req)
        res["keywords_images"] = imgkeywords
    if req["use_text"] and siteData.get("text") and len(siteData["text"]) > 0:
        siteText = " ".join(siteData["text"])
        keywords = extract_keywords_nltk(siteText, req["num_keywords_text"])
        res["keywords_text"] = keywords
    if res == {}:
        return {"error": "analysis.py, problem encountered when analysing site data"}
    else:
        return generateKeywords(res, req, categories=["text_only", "combined_images"])
    #try:
    #except:
    #    return {"error":"analysis.py, problem encountered while selecting keywords"}
    #return res
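# Minimal usage sketch. siteData/req below are hypothetical examples: the field
# names are taken from the lookups in the functions above, the values are made up.
if __name__ == "__main__":
    siteData = {
        "images": [],  # no image URLs, so the external keywording API is not called
        "text": ["The bakery sells fresh bread, cakes and coffee every morning."],
    }
    req = {
        "use_images": False,
        "use_text": True,
        "num_keywords_text": 10,
        "num_query_keywords": 3,
        "result_images": 16,  # with imagesPerSearch=8 this yields two queries
        "page": 0,
        "num_images": 1,
    }
    print(analyseSite(siteData, req))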