Spaces:
Sleeping
Sleeping
File size: 2,191 Bytes
5dde853 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import datasets
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
def load_scientific_paper_dataset():
    """Load the processed scientific-paper summaries as LangChain documents.

    Reads the ``train`` split of
    ``gsasikiran/Summarize-Scientific-Papers-Processed`` and renders every
    record as one ``Document``: the page content is a newline-separated list
    of ``"<label>: <value>"`` lines and the metadata carries the paper title.

    Returns:
        A list of ``Document`` objects, one per dataset record, in dataset
        order.
    """
    # (label, dataset column) pairs in the order they appear in the rendered
    # text. The labels end up in the indexed page content, so they are kept
    # verbatim.
    labelled_columns = (
        ("Title", "title"),
        ("Authors", "authors"),
        ("What is it", "article_classification"),
        ("Claims", "claims"),
        ("Contradictions", "contradictions_and_limitations"),
        ("Ethical Considerations", "ethical_considerations"),
        ("Summary", "executive_summary"),
        ("Subfield", "field_subfield"),
        ("Theorical Implications", "interpretation_and_theoretical_implications"),
        ("Method to Retrieve Info", "methodological_details"),
        ("People used to get data", "procedures_and_architectures"),
        ("Context of Research", "research_context"),
        ("Research Hypothesis", "research_question_and_hypothesis"),
        ("Three Takeways", "three_takeaways"),
        ("Type of Paper", "type_of_paper"),
    )

    papers = datasets.load_dataset(
        "gsasikiran/Summarize-Scientific-Papers-Processed",
        split="train",
    )

    documents = []
    for paper in papers:
        rendered = "\n".join(
            f"{label}: {paper[column]}" for label, column in labelled_columns
        )
        documents.append(
            Document(page_content=rendered, metadata={"title": paper["title"]})
        )
    return documents
# --- Scientific Paper Retriever ---
class ScientificPaperRetriever:
    """Keyword (BM25) search over a collection of scientific-paper documents."""

    # Upper bound on how many matching documents ``run`` includes in its answer.
    MAX_RESULTS = 3

    def __init__(self, docs):
        """Build a BM25 index over ``docs``.

        Args:
            docs: The documents to index, as accepted by
                ``BM25Retriever.from_documents``.
        """
        self.retriever = BM25Retriever.from_documents(docs)

    def run(self, query: str) -> str:
        """Return the text of the best-matching papers for ``query``.

        Args:
            query: Free-text search query.

        Returns:
            The page content of up to ``MAX_RESULTS`` matching documents,
            separated by blank lines, or a fixed "not found" message when the
            retriever returns nothing.
        """
        # LangChain retrievers are queried through ``invoke``; they have no
        # ``retrieve`` method.
        results = self.retriever.invoke(query)
        if not results:
            return "No matching scientific paper found."
        # LangChain ``Document`` stores its text in ``page_content``, not
        # ``text``.
        return "\n\n".join(doc.page_content for doc in results[: self.MAX_RESULTS])
|