from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.probability import FreqDist
import requests 
from dotenv import load_dotenv
import os
import random

load_dotenv()

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize, keep nouns longer than two characters, drop stop words, and lemmatize."""
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    # Keep only noun tokens (POS tags starting with 'NN') that are not stop words
    filtered_tokens = [word for (word, pos) in pos_tag(tokens)
                       if word not in stop_words and len(word) > 2 and pos[:2] == 'NN']
    lemmatized_tokens = [wnl.lemmatize(word) for word in filtered_tokens]
    return lemmatized_tokens
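
# Illustrative example (noun lemmas only; the exact output depends on the
# tagger and lemmatizer versions):
#
#   preprocess_text("The foxes were hunting in the dark forests")
#   # -> e.g. ['fox', 'forest']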


def extract_keywords_nltk(text: str, num_keywords_text: int = 10):
    """Return the most frequent noun lemmas in `text` as keywords."""
    words = preprocess_text(text)
    # Frequency distribution of the preprocessed words
    freq_dist = FreqDist(words)
    # Take the num_keywords_text most common words
    most_common_words = freq_dist.most_common(num_keywords_text)
    keywords = [word for word, _ in most_common_words]
    return keywords
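
# Example usage (assumes the NLTK data packages 'punkt', 'stopwords', 'wordnet'
# and 'averaged_perceptron_tagger' have been fetched via nltk.download();
# outputs shown are illustrative):
#
#   text = "Foxes live in forests. The forest gives the fox food and shelter."
#   extract_keywords_nltk(text, num_keywords_text=2)
#   # -> e.g. ['forest', 'fox']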


def request_image_analysis(api_url, api_client, api_key, image_url, num_keyword=10):
    """Query the external image-keywording API for a single image URL."""
    params = {'url': image_url, 'num_keywords': num_keyword}
    try:
        response = requests.get(api_url, params=params, auth=(api_client, api_key))
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Stringify the exception so the error payload stays JSON-serializable
        return {"error": "Failed Image keywording call", "message": str(e)}

def parseImgLis(keywordlist: list):
    """Flatten a list of {"keyword": ...} entries into a list of keyword strings."""
    return [entry["keyword"] for entry in keywordlist]

def analyseImages(images: list, req: dict):
    """Run the keywording API on a random subset of the site's images."""
    api_url = os.environ.get('AIGENENDPOINT')
    api_key = os.environ.get('AIGENKEY')
    api_client = os.environ.get('AIGENCLIENT')

    # Never try to sample more images than are available
    selectedImages = random.sample(images, min(req["num_images"], len(images)))
    keywords = {}
    for i in range(len(selectedImages)):
        res = request_image_analysis(api_url, api_client, api_key, selectedImages[i])
        if "error" in res:
            return res
        keywords[str(i)] = parseImgLis(res['keywords'])
    return keywords
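
# analyseImages returns one keyword list per sampled image, keyed by the
# sample index as a string, e.g. (illustrative values only):
#
#   {"0": ["beach", "sand", "wave"], "1": ["sunset", "sky"]}
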

def generateKeywords(siteData, req, imagesPerSearch: int = 8, subsetseed: int = 42,
                     categories: list = ["text_only", "combined_images"]):
    # For now keep it simple: two separate searches, one over text-derived and
    # one over image-derived keywords.
    categories = list(categories)  # copy, so the caller's (and the default) list is never mutated
    res = []
    queryCount = 0
    imageKeywords = []
    # Re-seeding keeps the sampled keywords stable across calls; variety comes
    # from querying further result pages of the Pexels API instead.
    random.seed(subsetseed)
    # Pool the per-image keyword lists into one combined list
    if 'keywords_images' in siteData and "combined_images" in categories:
        for i in range(len(siteData["keywords_images"])):
            if str(i) in siteData["keywords_images"]:
                imageKeywords += siteData["keywords_images"][str(i)]

    # Drop categories that lack enough keywords to build a query
    if "text_only" in categories and ("keywords_text" not in siteData
            or len(siteData["keywords_text"]) < req["num_query_keywords"]):
        categories.remove("text_only")
    if "combined_images" in categories and len(imageKeywords) < req["num_query_keywords"]:
        categories.remove("combined_images")
    if len(categories) == 0:
        return {"error": "Not enough keywords to choose from"}
    queriesPerPage = req["result_images"] / imagesPerSearch
    currentCategory = int((req["page"] * queriesPerPage) % len(categories))
    while queryCount < queriesPerPage:
        catname = categories[currentCategory]
        uniquePage = int(queryCount / len(categories) * (req["page"] + 1))
        if catname == "text_only":
            random.seed(subsetseed)
            query = {"keywords": random.sample(siteData['keywords_text'], req["num_query_keywords"]),
                     "num_images": imagesPerSearch, "page": uniquePage, "id": catname}
            res.append(query)
        elif catname == "combined_images":
            random.seed(subsetseed)
            query = {"keywords": random.sample(imageKeywords, req["num_query_keywords"]),
                     "num_images": imagesPerSearch, "page": uniquePage, "id": catname}
            res.append(query)
        queryCount += 1
        currentCategory = (currentCategory + 1) % len(categories)
    return {"queries": res}
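
# generateKeywords returns a dict of search queries for the image provider,
# e.g. (illustrative shape; actual keywords depend on the analysed site):
#
#   {"queries": [
#       {"keywords": ["fox", "forest", "food"], "num_images": 8, "page": 0, "id": "text_only"},
#       {"keywords": ["sunset", "sky", "cloud"], "num_images": 8, "page": 0, "id": "combined_images"},
#   ]}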

def analyseSite(siteData, req):
    """Extract keywords from a site's images and/or text, then build search queries."""
    res = {}
    if req["use_images"] and siteData.get("images"):
        res["keywords_images"] = analyseImages(siteData["images"], req)
    if req["use_text"] and siteData.get("text"):
        siteText = " ".join(siteData["text"])
        res["keywords_text"] = extract_keywords_nltk(siteText, req["num_keywords_text"])
    if res == {}:
        return {"error": "analysis.py, problem encountered when analysing site data"}
    return generateKeywords(res, req, categories=["text_only", "combined_images"])
        
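
if __name__ == "__main__":
    # Minimal usage sketch. The siteData/req dicts below are hypothetical
    # examples; use_images is disabled so no AIGEN* credentials are needed,
    # only the NLTK data packages used by preprocess_text.
    sample_site = {
        "images": [],
        "text": [
            "Foxes live in forests.",
            "The forest gives the fox food and shelter.",
            "Forest animals such as foxes hunt at night.",
        ],
    }
    sample_req = {
        "use_images": False,
        "use_text": True,
        "num_keywords_text": 10,
        "num_query_keywords": 3,
        "result_images": 16,  # with imagesPerSearch=8 this yields two queries
        "page": 0,
    }
    print(analyseSite(sample_site, sample_req))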