import datasets
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever

# (label, dataset column) pairs in the order they are rendered into a
# document's text. The labels are emitted verbatim into ``page_content`` and
# therefore become part of the text that BM25 indexes, so they are kept
# exactly as originally spelled.
_PAPER_FIELDS = (
    ("Title", "title"),
    ("Authors", "authors"),
    ("What is it", "article_classification"),
    ("Claims", "claims"),
    ("Contradictions", "contradictions_and_limitations"),
    ("Ethical Considerations", "ethical_considerations"),
    ("Summary", "executive_summary"),
    ("Subfield", "field_subfield"),
    ("Theorical Implications", "interpretation_and_theoretical_implications"),
    ("Method to Retrieve Info", "methodological_details"),
    ("People used to get data", "procedures_and_architectures"),
    ("Context of Research", "research_context"),
    ("Research Hypothesis", "research_question_and_hypothesis"),
    ("Three Takeways", "three_takeaways"),
    ("Type of Paper", "type_of_paper"),
)


def _paper_to_document(scientific_paper):
    """Render one dataset row as a ``Document`` with one "Label: value" line per field."""
    page_content = "\n".join(
        f"{label}: {scientific_paper[column]}" for label, column in _PAPER_FIELDS
    )
    return Document(
        page_content=page_content,
        metadata={"title": scientific_paper["title"]},
    )


def load_scientific_paper_dataset():
    """Load the scientific-papers dataset and convert each row to a ``Document``.

    Reads the ``train`` split of
    ``gsasikiran/Summarize-Scientific-Papers-Processed`` via
    ``datasets.load_dataset``.

    Returns:
        A list with one ``Document`` per dataset row. ``page_content`` holds
        the row's fields as newline-separated "Label: value" lines, and
        ``metadata["title"]`` holds the row's title.

    Raises:
        KeyError: If a row lacks one of the columns listed in
            ``_PAPER_FIELDS``.
    """
    scientific_paper_dataset = datasets.load_dataset(
        "gsasikiran/Summarize-Scientific-Papers-Processed",
        split="train",
    )
    return [
        _paper_to_document(scientific_paper)
        for scientific_paper in scientific_paper_dataset
    ]


# --- Scientific Paper Retriever ---
class ScientificPaperRetriever:
    """Keyword (BM25) search over scientific-paper documents."""

    # Upper bound on how many matching documents ``run`` includes in its reply.
    MAX_RESULTS = 3

    def __init__(self, docs):
        """Build a BM25 retriever over *docs*.

        Args:
            docs: ``Document`` objects to index, e.g. the list returned by
                ``load_scientific_paper_dataset``.
        """
        self.retriever = BM25Retriever.from_documents(docs)

    def run(self, query: str) -> str:
        """Return the text of the best-matching papers for *query*.

        Args:
            query: Free-text search string.

        Returns:
            The ``page_content`` of up to ``MAX_RESULTS`` retrieved documents,
            separated by blank lines, or a fixed "not found" message when the
            retriever returns nothing.
        """
        # LangChain retrievers are queried through ``invoke``; they do not
        # expose a ``retrieve`` method.
        results = self.retriever.invoke(query)
        if not results:
            return "No matching scientific paper found."
        # LangChain ``Document`` keeps its body in ``page_content``; it has no
        # ``text`` attribute.
        return "\n\n".join(doc.page_content for doc in results[: self.MAX_RESULTS])