Spaces:
Running
on
Zero
Running
on
Zero
| import re | |
| import fitz # pip install pymupdf | |
| from unidecode import unidecode | |
| from nltk.tokenize import sent_tokenize | |
| import bm25s | |
def retrieve_with_bm25s(pdf_file, claim, top_k=10):
    """Retrieve the sentences of a PDF that best match a claim using BM25.

    The PDF text is extracted with PyMuPDF, cleaned, split into sentences
    with NLTK, indexed with bm25s, and queried with the claim.

    Args:
        pdf_file: Path to the PDF file to search.
        claim: Query text to match against the PDF's sentences.
        top_k: Maximum number of sentences to return. May be an int or a
            value convertible with ``int()`` (e.g. a numeric string).

    Returns:
        A single string made of the retrieved sentences joined by spaces,
        in the order returned by the retriever. An empty string is returned
        when no sentences could be extracted from the PDF.
    """
    # Get PDF file as binary
    with open(pdf_file, mode="rb") as f:
        pdf_file_bytes = f.read()

    # Extract text from the PDF; the context manager closes the document
    # even if text extraction raises.
    with fitz.open(stream=pdf_file_bytes, filetype="pdf") as pdf_doc:
        pdf_text = "".join(page.get_text("text") for page in pdf_doc)

    # Clean text. Example:
    #   pdf_text   = 'In §3.1, we find\nthat dis-\ntractor abstracts.'
    #   clean_text = 'In SS3.1, we find that distractor abstracts.'
    # Remove hyphens at end of lines (re-joins words split across lines)
    clean_text = pdf_text.replace("-\n", "")
    # Replace remaining newline characters with space
    clean_text = clean_text.replace("\n", " ")
    # Replace unicode with ascii
    clean_text = unidecode(clean_text)

    # Parse text into sentences to build the corpus
    corpus = sent_tokenize(clean_text)
    if not corpus:
        # Nothing to index (e.g. a PDF without extractable text)
        return ""

    # Tokenize the corpus
    corpus_tokens = bm25s.tokenize(corpus, stopwords="en")

    # Initialize the BM25 model
    retriever = bm25s.BM25()
    retriever.index(corpus_tokens, show_progress=False)

    # Tokenize the claim
    query_tokens = bm25s.tokenize(claim)

    # Use int(top_k) in case we get a str value (as in retrieval example).
    # Clamp to the corpus size: bm25s raises ValueError when k is larger
    # than the number of indexed documents.
    k = min(int(top_k), len(corpus))

    # Get top k results
    results, _scores = retriever.retrieve(query_tokens, corpus=corpus, k=k)

    # Join sentences and return results
    return " ".join(results[0])