Spaces:

bertugmirasyedi
/

aristotle-api

Sleeping

App Files Files Community

bertugmirasyedi committed on Mar 30, 2023

Commit

b402f97

1 Parent(s): 94b6bc9

Complete overhaul

Browse files

Files changed (1) hide show

app.py +433 -239

app.py CHANGED Viewed

@@ -1,22 +1,33 @@
 from fastapi import FastAPI
 # Define the FastAPI app
 app = FastAPI(docs_url="/")
-@app.get("/search={query}&similarity={similarity}")
-def search(query, similarity=False):
     import time
     import requests
     start_time = time.time()
-    # Set the API endpoint and query parameters
-    url = "https://www.googleapis.com/books/v1/volumes"
-    params = {"q": str(query), "printType": "books", "maxResults": 1}
-    # Send a GET request to the API with the specified parameters
-    response = requests.get(url, params=params)
     # Initialize the lists to store the results
     titles = []
     authors = []
@@ -24,230 +35,255 @@ def search(query, similarity=False):
     descriptions = []
     images = []
-    # Parse the response JSON and append the results
-    data = response.json()
-    for item in data["items"]:
-        volume_info = item["volumeInfo"]
-        try:
-            titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
-        except KeyError:
-            titles.append(volume_info["title"])
-        try:
-            descriptions.append(volume_info["description"])
-        except KeyError:
-            descriptions.append("Null")
-        try:
-            publishers.append(volume_info["publisher"])
-        except KeyError:
-            publishers.append("Null")
-        try:
-            authors.append(volume_info["authors"][0])
-        except KeyError:
-            authors.append("Null")
-        try:
-            images.append(volume_info["imageLinks"]["thumbnail"])
-        except KeyError:
-            images.append(
-                "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
-            )
-    ### Openalex ###
-    import pyalex
-    from pyalex import Works
-    # Add email to the config
-    pyalex.config.email = "ber2mir@gmail.com"
-    # Define a pager object with the same query
-    pager = Works().search(str(query)).paginate(per_page=1, n_max=1)
-    # Generate a list of the results
-    openalex_results = list(pager)
-    # Get the titles, descriptions, and publishers and append them to the lists
-    for result in openalex_results[0]:
-        try:
-            titles.append(result["title"])
-        except KeyError:
-            titles.append("Null")
-        try:
-            descriptions.append(result["abstract"])
-        except KeyError:
-            descriptions.append("Null")
-        try:
-            publishers.append(result["host_venue"]["publisher"])
-        except KeyError:
-            publishers.append("Null")
-        try:
-            authors.append(result["authorships"][0]["author"]["display_name"])
-        except KeyError:
-            authors.append("Null")
-        images.append(
-            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
-        )
-    ### OpenAI ###
-    import openai
-    # Set the OpenAI API key
-    openai.api_key = "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"
-    # Create ChatGPT query
-    chatgpt_response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {
-                "role": "system",
-                "content": "You are a librarian. You are helping a patron find a book.",
-            },
-            {
-                "role": "user",
-                "content": f"Recommend me 1 books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
-            },
-        ],
-    )
-    # Split the response into a list of results
-    chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split("\n")[
-        2::2
-    ]
-    # Define a function to parse the results
-    def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
-        # Create a dict to store the key-value pairs
-        parsed_result = {}
-        for key in ordered_keys:
-            # Split the result string by the key and append the value to the list
-            if key != ordered_keys[-1]:
-                parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
-            else:
-                parsed_result[key] = result.split(f"{key}: ")[1]
-        return parsed_result
-    ordered_keys = ["Title", "Author", "Publisher", "Summary"]
-    for result in chatgpt_results:
-        try:
-            # Parse the result
-            parsed_result = parse_result(result, ordered_keys=ordered_keys)
-            # Append the parsed result to the lists
-            titles.append(parsed_result["Title"])
-            authors.append(parsed_result["Author"])
-            publishers.append(parsed_result["Publisher"])
-            descriptions.append(parsed_result["Summary"])
             images.append(
                 "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
-        # In case the OpenAI API hits the limit
-        except IndexError:
-            break
-    ### Prediction ###
-    from transformers import (
-        AutoTokenizer,
-        AutoModelForSeq2SeqLM,
-        AutoModelForSequenceClassification,
-        pipeline,
-    )
-    from sentence_transformers import SentenceTransformer, CrossEncoder
-    from sentence_transformers.util import cos_sim, dot_score
-    # Load the classifiers
-    # classifier = TextClassifier.load(
-    #    "trainers/deberta-v3-base-tasksource-nli/best-model.pt"
-    # )
-    # sentence_transformer = SentenceTransformer("all-MiniLM-L12-v2")
-    # cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")
     # Combine title, description, and publisher into a single string
     combined_data = [
-        f"{title} {description} {publisher}"
         for title, description, publisher in zip(titles, descriptions, publishers)
     ]
-    # Prepare the Sentence object
-    # sentences = [
-    #    Sentence(doc, use_tokenizer=SegtokTokenizer()) for doc in combined_data
-    # ]
-    # Classify the sentences
-    # classifier.predict(sentences)
-    # Get the predicted labels
-    # classes = [sentence.labels for sentence in sentences]
-    # Define the summarizer model and tokenizer
-    sum_tokenizer = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
-    # sum_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")
-    sum_model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
-    summarizer_pipeline = pipeline(
-        "summarization",
-        model=sum_model,
-        tokenizer=sum_tokenizer,
-        batch_size=64,
-    )
-    # Define the zero-shot classifier
-    zs_tokenizer = AutoTokenizer.from_pretrained(
-        "sileod/deberta-v3-base-tasksource-nli"
-    )
-    # Quickfix for the tokenizer
-    # zs_tokenizer.model_input_names = ["input_ids", "attention_mask"]
-    zs_model = AutoModelForSequenceClassification.from_pretrained(
-        "sileod/deberta-v3-base-tasksource-nli"
-    )
-    zs_classifier = pipeline(
-        "zero-shot-classification",
-        model=zs_model,
-        tokenizer=zs_tokenizer,
-        batch_size=64,
-        hypothesis_template="This book is {}.",
-        multi_label=True,
-    )
-    # Summarize the descriptions
-    summaries = [
-        summarizer_pipeline(description[0:1024])
-        if (description != None)
-        else [{"summary_text": "Null"}]
-        for description in descriptions
-    ]
-    # Predict the level of the book
-    candidate_labels = [
-        "Introductory",
-        "Advanced",
-        "Academic",
-        "Not Academic",
-        "Manual",
-    ]
-    # Get the predicted labels
-    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]
-    # Calculate the elapsed time
-    end_time = time.time()
-    runtime = f"{end_time - start_time:.2f} seconds"
-    # Calculate the similarity between the books
-    if similarity:
         from sentence_transformers import util
         sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
@@ -255,36 +291,194 @@ def search(query, similarity=False):
             combined_data, convert_to_tensor=True
         )
         similar_books = []
-        for i in range(len(titles)):
             current_embedding = book_embeddings[i]
             similarity_sorted = util.semantic_search(
-                current_embedding, book_embeddings, top_k=20
             )
             similar_books.append(
                 {
                     "sorted_by_similarity": similarity_sorted[0][1:],
                 }
             )
-    # Create a list of dictionaries to store the results
-    results = []
-    for i in range(len(titles)):
-        results.append(
-            {
-                "id": i,
-                "title": titles[i],
-                "author": authors[i],
-                "publisher": publishers[i],
-                "image_link": images[i],
-                "labels": classes[i]["labels"][0:2],
-                "label_confidences": classes[i]["scores"][0:2],
-                "summary": summaries[i][0]["summary_text"],
-                "similar_books": similar_books[i]["sorted_by_similarity"],
-                "runtime": runtime,
-            }
         )
     return results

 from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
 # Define the FastAPI app
 app = FastAPI(docs_url="/")
+# Add the CORS middleware to the app
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.get("/search")
+def search(
+    query: str,
+    classification: bool = True,
+    summarization: bool = True,
+    similarity: bool = False,
+    add_chatgpt_results: bool = True,
+    n_results: int = 10,
+):
     import time
     import requests
     start_time = time.time()
     # Initialize the lists to store the results
     titles = []
     authors = []
     descriptions = []
     images = []
+    def gbooks_search(query, n_results=30):
+        """
+        Access the Google Books API and return the results.
+        """
+        # Set the API endpoint and query parameters
+        url = "https://www.googleapis.com/books/v1/volumes"
+        params = {"q": str(query), "printType": "books", "maxResults": n_results}
+        # Send a GET request to the API with the specified parameters
+        response = requests.get(url, params=params)
+        # Parse the response JSON and append the results
+        data = response.json()
+        # Initialize the lists to store the results
+        titles = []
+        authors = []
+        publishers = []
+        descriptions = []
+        images = []
+        for item in data["items"]:
+            volume_info = item["volumeInfo"]
+            try:
+                titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
+            except KeyError:
+                titles.append(volume_info["title"])
+            try:
+                descriptions.append(volume_info["description"])
+            except KeyError:
+                descriptions.append("Null")
+            try:
+                publishers.append(volume_info["publisher"])
+            except KeyError:
+                publishers.append("Null")
+            try:
+                authors.append(volume_info["authors"][0])
+            except KeyError:
+                authors.append("Null")
+            try:
+                images.append(volume_info["imageLinks"]["thumbnail"])
+            except KeyError:
+                images.append(
+                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
+                )
+        return titles, authors, publishers, descriptions, images
+    # Run the gbooks_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = gbooks_search(query, n_results=n_results)
+    # Append the results to the lists
+    [titles.append(title) for title in titles_placeholder]
+    [authors.append(author) for author in authors_placeholder]
+    [publishers.append(publisher) for publisher in publishers_placeholder]
+    [descriptions.append(description) for description in descriptions_placeholder]
+    [images.append(image) for image in images_placeholder]
+    # Get the time since the start
+    first_checkpoint = time.time()
+    first_checkpoint_time = int(first_checkpoint - start_time)
+    def openalex_search(query, n_results=10):
+        """
+        Run a search on OpenAlex and return the results.
+        """
+        import pyalex
+        from pyalex import Works
+        # Add email to the config
+        pyalex.config.email = "ber2mir@gmail.com"
+        # Define a pager object with the same query
+        pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)
+        # Generate a list of the results
+        openalex_results = list(pager)
+        # Initialize the lists to store the results
+        titles = []
+        authors = []
+        publishers = []
+        descriptions = []
+        images = []
+        # Get the titles, descriptions, and publishers and append them to the lists
+        for result in openalex_results[0]:
+            try:
+                titles.append(result["title"])
+            except KeyError:
+                titles.append("Null")
+            try:
+                descriptions.append(result["abstract"])
+            except KeyError:
+                descriptions.append("Null")
+            try:
+                publishers.append(result["host_venue"]["publisher"])
+            except KeyError:
+                publishers.append("Null")
+            try:
+                authors.append(result["authorships"][0]["author"]["display_name"])
+            except KeyError:
+                authors.append("Null")
             images.append(
                 "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
+            return titles, authors, publishers, descriptions, images
+    # Run the openalex_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = openalex_search(query, n_results=n_results)
+    # Append the results to the lists
+    [titles.append(title) for title in titles_placeholder]
+    [authors.append(author) for author in authors_placeholder]
+    [publishers.append(publisher) for publisher in publishers_placeholder]
+    [descriptions.append(description) for description in descriptions_placeholder]
+    [images.append(image) for image in images_placeholder]
+    # Calculate the elapsed time between the first and second checkpoints
+    second_checkpoint = time.time()
+    second_checkpoint_time = int(second_checkpoint - first_checkpoint)
+    def openai_search(query, n_results=10):
+        """
+        Create a query to the OpenAI ChatGPT API and return the results.
+        """
+        import openai
+        # Initialize the lists to store the results
+        titles = []
+        authors = []
+        publishers = []
+        descriptions = []
+        images = []
+        # Set the OpenAI API key
+        openai.api_key = "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"
+        # Create ChatGPT query
+        chatgpt_response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a librarian. You are helping a patron find a book.",
+                },
+                {
+                    "role": "user",
+                    "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
+                },
+            ],
+        )
+        # Split the response into a list of results
+        chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split(
+            "\n"
+        )[2::2]
+        # Define a function to parse the results
+        def parse_result(
+            result, ordered_keys=["Title", "Author", "Publisher", "Summary"]
+        ):
+            # Create a dict to store the key-value pairs
+            parsed_result = {}
+            for key in ordered_keys:
+                # Split the result string by the key and append the value to the list
+                if key != ordered_keys[-1]:
+                    parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
+                else:
+                    parsed_result[key] = result.split(f"{key}: ")[1]
+            return parsed_result
+        ordered_keys = ["Title", "Author", "Publisher", "Summary"]
+        for result in chatgpt_results:
+            try:
+                # Parse the result
+                parsed_result = parse_result(result, ordered_keys=ordered_keys)
+                # Append the parsed result to the lists
+                titles.append(parsed_result["Title"])
+                authors.append(parsed_result["Author"])
+                publishers.append(parsed_result["Publisher"])
+                descriptions.append(parsed_result["Summary"])
+                images.append(
+                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
+                )
+            # In case the OpenAI API hits the limit
+            except IndexError:
+                break
+        return titles, authors, publishers, descriptions, images
+    if add_chatgpt_results:
+        # Run the openai_search function
+        (
+            titles_placeholder,
+            authors_placeholder,
+            publishers_placeholder,
+            descriptions_placeholder,
+            images_placeholder,
+        ) = openai_search(query)
+        # Append the results to the lists
+        [titles.append(title) for title in titles_placeholder]
+        [authors.append(author) for author in authors_placeholder]
+        [publishers.append(publisher) for publisher in publishers_placeholder]
+        [descriptions.append(description) for description in descriptions_placeholder]
+        [images.append(image) for image in images_placeholder]
+    # Calculate the elapsed time between the second and third checkpoints
+    third_checkpoint = time.time()
+    third_checkpoint_time = int(third_checkpoint - second_checkpoint)
     # Combine title, description, and publisher into a single string
     combined_data = [
+        f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
         for title, description, publisher in zip(titles, descriptions, publishers)
     ]
+    def find_similar(combined_data, top_k=10):
+        """
+        Calculate the similarity between the books and return the top_k results.
+        """
+        from sentence_transformers import SentenceTransformer
         from sentence_transformers import util
         sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
             combined_data, convert_to_tensor=True
         )
+        # Make sure that the top_k value is not greater than the number of books
+        top_k = len(combined_data) if top_k > len(combined_data) else top_k
         similar_books = []
+        for i in range(len(combined_data)):
+            # Get the embedding for the ith book
             current_embedding = book_embeddings[i]
+            # Calculate the similarity between the ith book and the rest of the books
             similarity_sorted = util.semantic_search(
+                current_embedding, book_embeddings, top_k=top_k
             )
+            # Append the results to the list
             similar_books.append(
                 {
                     "sorted_by_similarity": similarity_sorted[0][1:],
                 }
             )
+        return similar_books
+    def summarize(descriptions):
+        """
+        Summarize the descriptions and return the results.
+        """
+        from transformers import (
+            AutoTokenizer,
+            AutoModelForSeq2SeqLM,
+            pipeline,
+        )
+        # Define the summarizer model and tokenizer
+        tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
+        model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
+        # Create the summarizer pipeline
+        summarizer_pipe = pipeline(
+            "summarization",
+            model=model,
+            tokenizer=tokenizer,
+            min_length=10,
+            max_length=128,
+        )
+        # Summarize the descriptions
+        summaries = [
+            summarizer_pipe(description)
+            if (len(description) > 0)
+            else [{"summary_text": "No summary text is available."}]
+            for description in descriptions
+        ]
+        return summaries
+    def classify(combined_data, parallel=False):
+        """
+        Create classifier pipeline and return the results.
+        """
+        from transformers import (
+            AutoTokenizer,
+            AutoModelForSequenceClassification,
+            pipeline,
+        )
+        # Define the zero-shot classifier
+        tokenizer = AutoTokenizer.from_pretrained(
+            "sileod/deberta-v3-base-tasksource-nli"
         )
+        model = AutoModelForSequenceClassification.from_pretrained(
+            "sileod/deberta-v3-base-tasksource-nli"
+        )
+        classifier_pipe = pipeline(
+            "zero-shot-classification",
+            model=model,
+            tokenizer=tokenizer,
+            hypothesis_template="This book is {}.",
+            batch_size=1,
+            device=-1,
+            multi_label=True,
+        )
+        # Define the candidate labels
+        candidate_labels = [
+            "Introductory",
+            "Advanced",
+            "Academic",
+            "Not Academic",
+            "Manual",
+        ]
+        if parallel:
+            import ray
+            import psutil
+            # Define the number of cores to use
+            num_cores = psutil.cpu_count(logical=True)
+            # Initialize Ray
+            ray.init(num_cpus=num_cores, ignore_reinit_error=True)
+            classifier_id = ray.put(classifier_pipe)
+            # Define the function to be parallelized
+            @ray.remote
+            def classify_parallel(classifier_id, doc, candidate_labels):
+                classifier = ray.get(classifier_id)
+                return classifier(doc, candidate_labels)
+            # Get the predicted labels
+            classes = [
+                classify_parallel.remote(classifier_id, doc, candidate_labels)
+                for doc in combined_data
+            ]
+        else:
+            # Get the predicted labels
+            classes = [classifier_pipe(doc, candidate_labels) for doc in combined_data]
+        return classes
+    # If true then run the similarity, summarize, and classify functions
+    if classification:
+        classes = classify(combined_data, parallel=False)
+    else:
+        classes = [
+            {"labels": ["No labels available."], "scores": [0]}
+            for i in range(len(combined_data))
+        ]
+    # Calculate the elapsed time between the third and fourth checkpoints
+    fourth_checkpoint = time.time()
+    classification_time = int(fourth_checkpoint - third_checkpoint)
+    if summarization:
+        summaries = summarize(descriptions)
+    else:
+        summaries = [
+            [{"summary_text": description}]
+            if (len(description) > 0)
+            else [{"summary_text": "No summary text is available."}]
+            for description in descriptions
+        ]
+    # Calculate the elapsed time between the fourth and fifth checkpoints
+    fifth_checkpoint = time.time()
+    summarization_time = int(fifth_checkpoint - fourth_checkpoint)
+    if similarity:
+        similar_books = find_similar(combined_data)
+    else:
+        similar_books = [
+            {"sorted_by_similarity": ["No similar books available."]}
+            for i in range(len(combined_data))
+        ]
+    # Calculate the elapsed time between the fifth and sixth checkpoints
+    sixth_checkpoint = time.time()
+    similarity_time = int(sixth_checkpoint - fifth_checkpoint)
+    # Calculate the total elapsed time
+    end_time = time.time()
+    runtime = f"{end_time - start_time:.2f} seconds"
+    # Create a list of dictionaries to store the results
+    results = [
+        {
+            "id": i,
+            "title": titles[i],
+            "author": authors[i],
+            "publisher": publishers[i],
+            "image_link": images[i],
+            "labels": classes[i]["labels"][0:2],
+            "label_confidences": classes[i]["scores"][0:2],
+            "summary": summaries[i][0]["summary_text"],
+            "similar_books": similar_books[i]["sorted_by_similarity"],
+            "checkpoints": [
+                {
+                    "Google Books Time": first_checkpoint_time,
+                    "OpenAlex Time": second_checkpoint_time,
+                    "OpenAI Time": third_checkpoint_time,
+                    "Classification Time": classification_time,
+                    "Summarization Time": summarization_time,
+                    "Similarity Computing Time": similarity_time,
+                }
+            ],
+            "total_runtime": runtime,
+        }
+        for i in range(len(combined_data))
+    ]
     return results