Update app.py
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ import fitz
|
|
| 12 |
import PyPDF2
|
| 13 |
import gradio
|
| 14 |
import sys
|
|
|
|
| 15 |
from pathlib import Path
|
| 16 |
utils_dir = Path(__file__).parent / 'utils'
|
| 17 |
sys.path.append(str(utils_dir))
|
|
@@ -19,9 +20,13 @@ from openai_utils import *
|
|
| 19 |
import base64
|
| 20 |
from pdf2image import convert_from_bytes
|
| 21 |
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
|
| 23 |
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
|
| 24 |
-
|
| 25 |
|
| 26 |
def insert_sentence(text, sentence, interval):
|
| 27 |
lines = text.split('\n')
|
|
@@ -44,7 +49,18 @@ def insert_sentence(text, sentence, interval):
|
|
| 44 |
new_lines.append(separator.join(new_words))
|
| 45 |
|
| 46 |
return '\n'.join(new_lines)
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
def search_paper(query):
|
| 49 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
|
| 50 |
url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"
|
|
@@ -57,10 +73,21 @@ def search_paper(query):
|
|
| 57 |
|
| 58 |
return response.json()
|
| 59 |
|
| 60 |
-
def
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
def download_pdf(paper):
|
| 66 |
pdf_url = paper["openAccessPdf"]["url"]
|
|
@@ -70,8 +97,7 @@ def download_pdf(paper):
|
|
| 70 |
|
| 71 |
|
| 72 |
file_object = BytesIO(response.content)
|
| 73 |
-
|
| 74 |
-
chunks = split_text_into_chunks(extract_text)
|
| 75 |
return chunks
|
| 76 |
except:
|
| 77 |
return []
|
|
@@ -79,7 +105,7 @@ def download_pdf(paper):
|
|
| 79 |
|
| 80 |
def recommendation(s2_id, limit=500):
|
| 81 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
|
| 82 |
-
url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf"
|
| 83 |
|
| 84 |
# print(url)
|
| 85 |
response = requests.get(url)
|
|
@@ -92,22 +118,20 @@ def recommendation(s2_id, limit=500):
|
|
| 92 |
|
| 93 |
|
| 94 |
def extract_chapter(file_object):
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
break
|
| 110 |
-
return extracted_text
|
| 111 |
|
| 112 |
|
| 113 |
|
|
@@ -138,7 +162,8 @@ class Reviewer:
|
|
| 138 |
for paper in papers:
|
| 139 |
retrieval_content += f"Relevant Paper {str(cnt)}:\n"
|
| 140 |
retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
|
| 141 |
-
|
|
|
|
| 142 |
cnt += 1
|
| 143 |
text = retrieval_content + content
|
| 144 |
chat_review_text = self.chat_review(text=text)
|
|
@@ -215,8 +240,8 @@ class Reviewer:
|
|
| 215 |
return rec_papers
|
| 216 |
|
| 217 |
def extract_related_content(self, papers, aspect):
|
| 218 |
-
os.environ["OPENAI_BASE_URL"] =
|
| 219 |
-
os.environ["OPENAI_API_KEY"] =
|
| 220 |
client = AsyncOpenAI()
|
| 221 |
|
| 222 |
messages = []
|
|
@@ -248,7 +273,7 @@ class Reviewer:
|
|
| 248 |
)
|
| 249 |
)
|
| 250 |
|
| 251 |
-
paper_data_list = [{"title": paper["title"], "content": ""} for paper in papers]
|
| 252 |
|
| 253 |
for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
|
| 254 |
if response.strip().lower().startswith("yes"):
|
|
@@ -314,7 +339,7 @@ Organize the result in JSON format as follows:
|
|
| 314 |
for paper_data, response in zip(paper_data_list, responses):
|
| 315 |
# print(response)
|
| 316 |
response = json.loads(response)
|
| 317 |
-
results.append({"title": paper_data["title"], "content": response["revised_text"]})
|
| 318 |
return results
|
| 319 |
|
| 320 |
|
|
@@ -372,7 +397,7 @@ Organize the result in JSON format as follows:
|
|
| 372 |
result = ""
|
| 373 |
limit_cnt = 1
|
| 374 |
for limitation in limitations:
|
| 375 |
-
result += f"{str(limit_cnt)}. {limitation}\n"
|
| 376 |
limit_cnt += 1
|
| 377 |
# for choice in response.choices:
|
| 378 |
# result += choice.message.content
|
|
@@ -390,7 +415,7 @@ Organize the result in JSON format as follows:
|
|
| 390 |
query = title
|
| 391 |
search_results = search_paper(query)
|
| 392 |
if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
|
| 393 |
-
search_result = search_results[0]
|
| 394 |
retrieval = recommendation(search_result["paperId"])
|
| 395 |
recommended_paper_list = []
|
| 396 |
for recommended_paper in retrieval["recommendedPapers"]:
|
|
@@ -443,7 +468,7 @@ Organize the result in JSON format as follows:
|
|
| 443 |
file_object = BytesIO(pdf_path) # TODO
|
| 444 |
pdf_reader = PyPDF2.PdfReader(file_object)
|
| 445 |
|
| 446 |
-
doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO
|
| 447 |
page = doc.load_page(0)
|
| 448 |
pix = page.get_pixmap()
|
| 449 |
image_bytes = pix.tobytes("png")
|
|
@@ -470,21 +495,21 @@ Organize the result in JSON format as follows:
|
|
| 470 |
title = response["title"]
|
| 471 |
abstract = response["abstract"]
|
| 472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
-
num_pages = len(pdf_reader.pages)
|
| 476 |
-
extraction_started = False
|
| 477 |
-
extracted_text = ""
|
| 478 |
-
for page_number in range(num_pages):
|
| 479 |
-
page = pdf_reader.pages[page_number]
|
| 480 |
-
page_text = page.extract_text()
|
| 481 |
-
|
| 482 |
-
extraction_started = True
|
| 483 |
-
page_number_start = page_number
|
| 484 |
-
if extraction_started:
|
| 485 |
-
extracted_text += page_text
|
| 486 |
-
if page_number_start + 1 < page_number:
|
| 487 |
-
break
|
| 488 |
return extracted_text, title, abstract
|
| 489 |
|
| 490 |
def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
|
|
@@ -511,9 +536,6 @@ def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
|
|
| 511 |
return retrieved_content, comments, output2
|
| 512 |
|
| 513 |
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
########################################################################################################
|
| 518 |
|
| 519 |
title = "LimitGen"
|
|
|
|
| 12 |
import PyPDF2
|
| 13 |
import gradio
|
| 14 |
import sys
|
| 15 |
+
from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk, OCRResponse
|
| 16 |
from pathlib import Path
|
| 17 |
utils_dir = Path(__file__).parent / 'utils'
|
| 18 |
sys.path.append(str(utils_dir))
|
|
|
|
| 20 |
import base64
|
| 21 |
from pdf2image import convert_from_bytes
|
| 22 |
import requests
|
| 23 |
+
import bibtexparser
|
| 24 |
+
from pybtex.database import parse_string
|
| 25 |
+
from pybtex.plugin import find_plugin
|
| 26 |
+
|
| 27 |
PRIVATE_API_KEY = os.getenv('PRIVATE_API_KEY')
|
| 28 |
PRIVATE_API_BASE = os.getenv('PRIVATE_API_BASE')
|
| 29 |
+
MISTRAL_API = os.getenv('MISTRAL_API')
|
| 30 |
|
| 31 |
def insert_sentence(text, sentence, interval):
|
| 32 |
lines = text.split('\n')
|
|
|
|
| 49 |
new_lines.append(separator.join(new_words))
|
| 50 |
|
| 51 |
return '\n'.join(new_lines)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def format_bibtex(paper, style='apa'):
    """Render a paper's BibTeX entry as a human-readable citation string.

    Parameters
    ----------
    paper : dict
        Semantic Scholar paper record; must contain
        ``paper["citationStyles"]["bibtex"]``.
    style : str
        Name of a pybtex formatting-style plugin (e.g. ``'apa'``, ``'unsrt'``).

    Returns
    -------
    str
        The rendered citation text (one line per entry), or a notice when
        the BibTeX source yields no usable entries.
    """
    raw_bibtex = paper["citationStyles"]["bibtex"]
    database = parse_string(raw_bibtex, 'bibtex')
    style_cls = find_plugin('pybtex.style.formatting', style)
    entry_list = list(database.entries.values())
    if not entry_list:
        return "No valid entries found."
    rendered = style_cls().format_entries(entry_list)
    return '\n'.join(item.text.render_as('text') for item in rendered)
| 63 |
+
|
| 64 |
def search_paper(query):
|
| 65 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/graph/v1/paper/"
|
| 66 |
url = f"{SEMANTIC_SCHOLAR_API_URL}search?query={query}&limit=3&fields=url,title,abstract&fieldsOfStudy=Computer Science"
|
|
|
|
| 73 |
|
| 74 |
return response.json()
|
| 75 |
|
| 76 |
+
def get_combined_markdown(pdf_response: OCRResponse) -> str:
    """Concatenate the per-page markdown of an OCR response.

    Pages are joined in order with a blank line between them, producing a
    single markdown document for the whole PDF.
    """
    page_texts = [page.markdown for page in pdf_response.pages]
    return "\n\n".join(page_texts)
|
| 82 |
+
|
| 83 |
+
def split_text_into_chunks(pdf_response: OCRResponse) -> list[str]:
    """Split an OCR response into per-page markdown chunks.

    Each page's markdown becomes one chunk, so downstream relevance
    filtering can operate on a page at a time.

    Parameters
    ----------
    pdf_response : OCRResponse
        Result of ``client.ocr.process`` on a PDF.

    Returns
    -------
    list[str]
        One markdown string per page, in page order.

    Note
    ----
    The return annotation previously claimed ``-> str``, but the function
    has always returned a list of per-page strings; the annotation is
    corrected here (behavior unchanged). Dead commented-out word-chunking
    code removed.
    """
    return [page.markdown for page in pdf_response.pages]
|
| 91 |
|
| 92 |
def download_pdf(paper):
|
| 93 |
pdf_url = paper["openAccessPdf"]["url"]
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
file_object = BytesIO(response.content)
|
| 100 |
+
chunks = extract_chapter(file_object)
|
|
|
|
| 101 |
return chunks
|
| 102 |
except:
|
| 103 |
return []
|
|
|
|
| 105 |
|
| 106 |
def recommendation(s2_id, limit=500):
|
| 107 |
SEMANTIC_SCHOLAR_API_URL = "https://api.semanticscholar.org/recommendations/v1/papers/forpaper/"
|
| 108 |
+
url = f"{SEMANTIC_SCHOLAR_API_URL}{s2_id}?limit={limit}&fields=url,title,abstract,publicationDate,isOpenAccess,openAccessPdf,citationStyles"
|
| 109 |
|
| 110 |
# print(url)
|
| 111 |
response = requests.get(url)
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
def extract_chapter(file_object):
    """OCR a PDF (given as a binary file-like object) into per-page chunks.

    Uploads the PDF bytes to Mistral, obtains a short-lived signed URL for
    the upload, runs the ``mistral-ocr-latest`` model on it, and returns the
    response split into page-level markdown chunks via
    ``split_text_into_chunks``.

    Parameters
    ----------
    file_object : io.BytesIO
        Readable binary stream positioned at the PDF content to upload.

    Returns
    -------
    list
        Per-page markdown strings produced by the OCR model.
    """
    ocr_client = Mistral(api_key=MISTRAL_API)
    payload = {
        "file_name": "retrieve.pdf",
        "content": file_object.read(),
    }
    uploaded = ocr_client.files.upload(file=payload, purpose="ocr")
    # NOTE(review): expiry=1 — presumably hours; confirm units against the
    # Mistral SDK before shortening/lengthening.
    url_info = ocr_client.files.get_signed_url(file_id=uploaded.id, expiry=1)
    document = DocumentURLChunk(document_url=url_info.url)
    ocr_result = ocr_client.ocr.process(
        document=document,
        model="mistral-ocr-latest",
        include_image_base64=True,
    )
    return split_text_into_chunks(ocr_result)
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
|
|
|
|
| 162 |
for paper in papers:
|
| 163 |
retrieval_content += f"Relevant Paper {str(cnt)}:\n"
|
| 164 |
retrieval_content += f"Title: {paper['title']}\n{paper['content']}\n\n"
|
| 165 |
+
formatted_citation = format_bibtex(paper, 'unsrt')
|
| 166 |
+
retrieved_papers += f"{str(cnt)}. {formatted_citation} ({paper['url']})\n\n"
|
| 167 |
cnt += 1
|
| 168 |
text = retrieval_content + content
|
| 169 |
chat_review_text = self.chat_review(text=text)
|
|
|
|
| 240 |
return rec_papers
|
| 241 |
|
| 242 |
def extract_related_content(self, papers, aspect):
|
| 243 |
+
os.environ["OPENAI_BASE_URL"] = PRIVATE_API_BASE
|
| 244 |
+
os.environ["OPENAI_API_KEY"] = PRIVATE_API_KEY
|
| 245 |
client = AsyncOpenAI()
|
| 246 |
|
| 247 |
messages = []
|
|
|
|
| 273 |
)
|
| 274 |
)
|
| 275 |
|
| 276 |
+
paper_data_list = [{"title": paper["title"], "content": "", "citationStyles": paper["citationStyles"], "url": paper["url"]} for paper in papers]
|
| 277 |
|
| 278 |
for (paper_idx, chunk_idx), response in zip(chunk_index_map, responses):
|
| 279 |
if response.strip().lower().startswith("yes"):
|
|
|
|
| 339 |
for paper_data, response in zip(paper_data_list, responses):
|
| 340 |
# print(response)
|
| 341 |
response = json.loads(response)
|
| 342 |
+
results.append({"title": paper_data["title"], "content": response["revised_text"], "citationStyles": paper_data["citationStyles"], "url": paper_data["url"]})
|
| 343 |
return results
|
| 344 |
|
| 345 |
|
|
|
|
| 397 |
result = ""
|
| 398 |
limit_cnt = 1
|
| 399 |
for limitation in limitations:
|
| 400 |
+
result += f"{str(limit_cnt)}. {limitation}\n\n"
|
| 401 |
limit_cnt += 1
|
| 402 |
# for choice in response.choices:
|
| 403 |
# result += choice.message.content
|
|
|
|
| 415 |
query = title
|
| 416 |
search_results = search_paper(query)
|
| 417 |
if search_results != [] and search_results["data"][0]["title"].lower() == title.lower():
|
| 418 |
+
search_result = search_results["data"][0]
|
| 419 |
retrieval = recommendation(search_result["paperId"])
|
| 420 |
recommended_paper_list = []
|
| 421 |
for recommended_paper in retrieval["recommendedPapers"]:
|
|
|
|
| 468 |
file_object = BytesIO(pdf_path) # TODO
|
| 469 |
pdf_reader = PyPDF2.PdfReader(file_object)
|
| 470 |
|
| 471 |
+
doc = fitz.open(stream=pdf_path, filetype="pdf") # TODO path/bytes
|
| 472 |
page = doc.load_page(0)
|
| 473 |
pix = page.get_pixmap()
|
| 474 |
image_bytes = pix.tobytes("png")
|
|
|
|
| 495 |
title = response["title"]
|
| 496 |
abstract = response["abstract"]
|
| 497 |
|
| 498 |
+
client = Mistral(api_key=MISTRAL_API)
|
| 499 |
+
file_object.seek(0)
|
| 500 |
+
uploaded_file = client.files.upload(
|
| 501 |
+
file={
|
| 502 |
+
"file_name": "upload.pdf",
|
| 503 |
+
"content": file_object.read(),
|
| 504 |
+
},
|
| 505 |
+
purpose="ocr",
|
| 506 |
+
)
|
| 507 |
|
| 508 |
+
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
|
| 509 |
+
pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)
|
| 510 |
+
# response_dict = json.loads(pdf_response.json())
|
| 511 |
+
extracted_text = get_combined_markdown(pdf_response)
|
| 512 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
return extracted_text, title, abstract
|
| 514 |
|
| 515 |
def main(api,api_base, paper_pdf, aspect, model_name, limit_num, enable_rag):
|
|
|
|
| 536 |
return retrieved_content, comments, output2
|
| 537 |
|
| 538 |
|
|
|
|
|
|
|
|
|
|
| 539 |
########################################################################################################
|
| 540 |
|
| 541 |
title = "LimitGen"
|