import datasets
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever

def load_scientific_paper_dataset():
    """Load the processed scientific-paper summaries as LangChain documents.

    Reads the ``train`` split of
    ``gsasikiran/Summarize-Scientific-Papers-Processed`` through
    ``datasets.load_dataset`` and renders every record as one ``Document``:
    a "Label: value" line per field, with the paper title repeated in the
    document metadata.

    Returns:
        A list containing one ``Document`` per dataset record.
    """
    # (label, dataset column) pairs, in the order they appear in the text.
    # NOTE(review): the labels are emitted verbatim into page_content, so the
    # spellings ("Theorical", "Takeways") and loose wordings are kept as-is
    # here; changing them changes the indexed text.
    labelled_columns = (
        ("Title", "title"),
        ("Authors", "authors"),
        ("What is it", "article_classification"),
        ("Claims", "claims"),
        ("Contradictions", "contradictions_and_limitations"),
        ("Ethical Considerations", "ethical_considerations"),
        ("Summary", "executive_summary"),
        ("Subfield", "field_subfield"),
        ("Theorical Implications", "interpretation_and_theoretical_implications"),
        ("Method to Retrieve Info", "methodological_details"),
        ("People used to get data", "procedures_and_architectures"),
        ("Context of Research", "research_context"),
        ("Research Hypothesis", "research_question_and_hypothesis"),
        ("Three Takeways", "three_takeaways"),
        ("Type of Paper", "type_of_paper"),
    )

    records = datasets.load_dataset(
        "gsasikiran/Summarize-Scientific-Papers-Processed", split="train"
    )

    # Convert dataset entries into Document objects
    documents = []
    for record in records:
        lines = [f"{label}: {record[column]}" for label, column in labelled_columns]
        documents.append(
            Document(
                page_content="\n".join(lines),
                metadata={"title": record["title"]},
            )
        )
    return documents


# --- Scientific Paper Retriever ---
class ScientificPaperRetriever:
    """Keyword (BM25) search over scientific-paper documents.

    Wraps a LangChain ``BM25Retriever`` built from the supplied documents and
    formats the best hits as plain text.
    """

    # Upper bound on the number of documents included in one answer.
    MAX_RESULTS = 3

    def __init__(self, docs):
        """Build the BM25 retriever.

        Args:
            docs: LangChain ``Document`` objects to index.
        """
        # Build BM25 retriever from documents
        self.retriever = BM25Retriever.from_documents(docs)

    def run(self, query: str) -> str:
        """Return the text of the best-matching papers for *query*.

        Args:
            query: Free-text search string.

        Returns:
            The ``page_content`` of at most ``MAX_RESULTS`` hits, separated by
            blank lines, or a fixed "not found" message when the retriever
            returns nothing.
        """
        # LangChain retrievers are queried through `invoke`; they have no
        # `retrieve` method, so the previous call raised AttributeError.
        results = self.retriever.invoke(query)
        if not results:
            return "No matching scientific paper found."
        # A Document stores its body in `page_content` (there is no `.text`).
        return "\n\n".join(
            doc.page_content for doc in results[: self.MAX_RESULTS]
        )