File size: 4,949 Bytes
a3b1677
ad9cc44
 
 
a3b1677
 
 
 
 
 
8e72555
a3b1677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e9e425
a3b1677
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#Testing and setup suite for API functionality
from api.scraping import scrapePage
from api.analysis import analyseSite, parseImgLis
from api.searchImages import performQueries
import nltk

def setup():
    """Download the NLTK corpora/models the analysis stage depends on."""
    for package in ("punkt", "averaged_perceptron_tagger", "wordnet", "stopwords"):
        nltk.download(package)

#Test data:
#Scraping
urllist = [
    "https://stackoverflow.com/questions/1052772/is-there-a-keyboard-shortcut-to-untab-move-a-block-of-code-to-the-left-in-ec",
    "https://www.gocusdom.com/",
    "https://en.wikipedia.org/wiki/Heron",
    "https://brandlume.com/12-proven-ways-to-make-your-website-stand-out/",
]

def _make_request(url, use_images, use_text):
    """Build a pipeline request dict; only url/use_images/use_text vary per test."""
    return {
        "url": url,
        "use_images": use_images,
        "use_text": use_text,
        "num_images": 1,
        "page": 0,
        "num_keywords_text": 10,
        "num_keywords_images": 10,
        "num_query_keywords": 5,
        "result_images": 24,
    }

exampleReq1 = _make_request(urllist[0], False, True)
exampleReq2 = _make_request(urllist[1], True, True)
exampleReq3 = _make_request(urllist[2], True, True)
exampleReq4 = _make_request(urllist[3], True, False)
# Deliberately broken inputs for the error-path tests:
exampleReq0 = _make_request("invalidurl", True, True)   # unparseable url -> scraper error
exampleReq5 = _make_request(urllist[0], False, False)   # no text, no images -> analysis error

errorFreeReqs = [exampleReq1, exampleReq2, exampleReq3, exampleReq4]


def pageScraperTest():
    """Run scrapePage over the canned requests and sanity-check each payload.

    Returns the four successful scrape results for the analysis suite.
    """
    # An invalid URL must surface the scraper's error dict verbatim.
    bad = scrapePage(exampleReq0)
    assert(bad == {"error": "scraping.py: url is not recognized as a valid url."})

    # Text-only request: no image payload, non-empty text.
    textOnly = scrapePage(exampleReq1)
    assert("images" not in textOnly)
    assert(len(textOnly["text"]) > 0)

    # Text + images.
    both = scrapePage(exampleReq2)
    assert(len(both["images"]) > 1)
    assert(len(both["text"]) > 0)

    wiki = scrapePage(exampleReq3)
    print(wiki)
    assert(len(wiki["images"]) > 1)
    assert(len(wiki["text"]) > 0)

    # Images-only request: no text payload.
    imagesOnly = scrapePage(exampleReq4)
    assert(len(imagesOnly["images"]) > 1)
    assert("text" not in imagesOnly)
    print("Successfully passed scraping suite")

    return [textOnly, both, wiki, imagesOnly]
#PART 2: Analysis

# Example analyseSite output before query parsing: 'keywords_images' maps an
# image index (as a string) to a list of {'keyword', 'score'} dicts, and
# 'keywords_text' is a flat list of text keywords.
# NOTE(review): not referenced by the visible tests below — confirm whether
# parseImgLis (imported above) is meant to consume it.
exampleRetPreParse = {'keywords_images': {'0': [{'keyword': 'sign', 'score': 0.6898373278547406}, {'keyword': 'symbol', 'score': 0.4472937186719514}, {'keyword': 'design', 'score': 0.2713939913056557}, {'keyword': 'architecture', 'score': 0.2713049676643986}, {'keyword': 'illustration', 'score': 0.26296517362059374}, {'keyword': 'success', 'score': 0.15815140125121907}, {'keyword': 'night', 'score': 0.1436041312950442}, {'keyword': 'icon', 'score': 0.14356085864611598}, {'keyword': 'finance', 'score': 0.13994914050650978}, {'keyword': 'ideas', 'score': 0.13742065469130105}]}, 'keywords_text': ['web', 'design', 'website', 'cusdom', 'business', 
'project', 'experience', 'user', 'college', 'startup']}

def analysisTest(scraperData):
    """Run analyseSite over the scraped fixtures; returns the four query sets."""
    # With use_images and use_text both disabled there is nothing to analyse,
    # so the analysis error dict must come back verbatim.
    noInput = analyseSite(scraperData[0], exampleReq5)
    assert(noInput == {"error": "analysis.py, problem encountered when analysing site data"})

    textQueries = analyseSite(scraperData[0], exampleReq1)
    assert(len(textQueries["queries"]) == 3)

    outputs = [textQueries]
    for siteData, request in ((scraperData[1], exampleReq2),
                              (scraperData[2], exampleReq3),
                              (scraperData[3], exampleReq4)):
        analysed = analyseSite(siteData, request)
        assert("error" not in analysed)
        outputs.append(analysed)
    print("passed analysis testing suite")
    return outputs
    


def queryTest(req):
    """Run performQueries for each of the four analysed requests; none may error."""
    for idx in range(4):
        searchResult = performQueries(req[idx])
        assert("error" not in searchResult)
    print("Passed image search suite")

def completeTest(req):
    """Run the full pipeline (scrape -> analyse -> search) for one request.

    At each stage, if the result carries an "error" key it is printed and
    returned immediately; on success the final image-search result is
    returned (previously the success path implicitly returned None).
    """
    scrapedData = scrapePage(req)
    if "error" in scrapedData:
        print(scrapedData)
        return scrapedData
    queries = analyseSite(scrapedData, req)
    # Original wrote `if("error") in queries:` — equivalent, but the C-style
    # parentheses read as a call; normalized to the plain membership test.
    if "error" in queries:
        print(queries)
        return queries
    res = performQueries(queries)
    if "error" in res:
        print(res)
        return res
    return res

if __name__ == '__main__':
    # One-time NLTK resource fetch before any analysis runs.
    setup()
    print("starting error free requests")
    for request in errorFreeReqs:
        print(f"requesting {request['url']}")
        completeTest(request)
    print("Finished error free requests, if no errors are printed above we are good to go.")

    # Stage-by-stage suites, kept around for debugging a single pipeline step:
    #scraperData = pageScraperTest()
    #queries = analysisTest(scraperData)
    #queryTest(queries)