from langchain_core.documents import Document
from typing import List
import pandas as pd
def format_docs_xml(docs: List[Document]) -> str:
    """
    Render a list of documents as one newline-delimited string.

    Each document contributes an entry made of its ``source`` metadata value
    on one line and its page content on the next. Entries are joined with a
    newline and the result is wrapped in three leading newlines and one
    trailing newline.

    Args:
        docs: Documents to render. Each must expose a ``metadata`` mapping
            and a ``page_content`` attribute.

    Returns:
        The formatted string. An empty ``docs`` list yields only the
        surrounding newlines.
    """
    # NOTE(review): despite the function name, the format strings here contain
    # no XML tags -- confirm that is intended and that no markup was lost.
    formatted_docs = [
        # A document without a "source" entry is labelled "Unknown" rather
        # than raising KeyError and aborting the whole batch.
        f"\n{doc.metadata.get('source', 'Unknown')}\n{doc.page_content}\n"
        for doc in docs
    ]
    return f"\n\n\n{chr(10).join(formatted_docs)}\n"
# def format_docs_xml(docs: List[Document]) -> str:
# """
# Takes a list of Document objects and formats each into XML.
# """
# formatted_docs = []
# for i, doc in enumerate(docs):
# metadata_source = doc.metadata.get("source", "Unknown")
# snippet = doc.page_content or ""
# formatted = (
# f'\n'
# f'{metadata_source}\n'
# f'{snippet}\n'
# f''
# )
# formatted_docs.append(formatted)
# return f"\n\n\n{chr(10).join(formatted_docs)}\n"
def get_article_info(df: pd.DataFrame, file_name: str):
    """
    Look up the title and link recorded for an article file.

    The incoming path is normalised before matching: backslashes are turned
    into forward slashes and every occurrence of "Articles/" is removed. The
    result is compared for exact equality against the "file_name" column.

    Args:
        df: Table with a "file_name" column; its "title" and "link" columns
            are read when a row matches.
        file_name: Path of the article file, using either slash style.

    Returns:
        A ``(title, link)`` tuple taken from the first matching row, or
        ``("IBHS Website", "https://ibhs.org")`` when no row matches.
    """
    # Backslashes are converted first, so only the forward-slash form of the
    # "Articles/" segment can still be present when it is stripped.
    normalized_name = file_name.replace("\\", "/").replace("Articles/", "")
    matches = df[df["file_name"] == normalized_name]
    if matches.empty:
        # Fallback if not found
        return "IBHS Website", "https://ibhs.org"
    # If several rows match, the first one wins.
    return matches["title"].iloc[0], matches["link"].iloc[0]