File size: 1,581 Bytes
04aa1c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa8ee23
 
04aa1c8
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from langchain_core.documents import Document
from typing import List
import pandas as pd

def format_docs_xml(docs: List[Document]) -> str:
    """Render documents as an XML-like ``<sources>`` block.

    Each document becomes a ``<source id="N">`` element, where ``N`` is the
    document's zero-based position in ``docs``. The element wraps the
    document's ``source`` metadata value and its ``page_content``. Values are
    interpolated as-is; no XML escaping is applied.

    Args:
        docs: Documents to format, in the order they should be numbered.

    Returns:
        Two leading newlines followed by a ``<sources>`` element containing
        one ``<source>`` entry per document, separated by newlines.
    """
    formatted_docs = []
    for i, doc in enumerate(docs):
        # A document without "source" metadata gets a placeholder instead of
        # raising KeyError and aborting the whole formatting pass.
        source = doc.metadata.get("source", "Unknown")
        formatted_docs.append(
            f'<source id="{i}">\n'
            f"<source>{source}</source>\n"
            f"<article_snippet>{doc.page_content}</article_snippet>\n"
            "</source>"
        )
    joined = "\n".join(formatted_docs)
    return f"\n\n<sources>\n{joined}\n</sources>"

# def format_docs_xml(docs: List[Document]) -> str:
#     """
#     Takes a list of Document objects and formats each into XML.
#     """
#     formatted_docs = []
#     for i, doc in enumerate(docs):
#         metadata_source = doc.metadata.get("source", "Unknown")
#         snippet = doc.page_content or ""
#         formatted = (
#             f'<source id="{i}">\n'
#             f'<source>{metadata_source}</source>\n'
#             f'<article_snippet>{snippet}</article_snippet>\n'
#             f'</source>'
#         )
#         formatted_docs.append(formatted)
#     return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"


def get_article_info(df: pd.DataFrame, file_name: str):
    """Look up the title and link recorded for an article file.

    ``file_name`` is normalized before the lookup: backslashes become forward
    slashes, then every occurrence of ``"Articles/"`` is removed. The result
    is compared for equality against the ``file_name`` column of ``df``.

    Args:
        df: Table with ``file_name``, ``title`` and ``link`` columns.
        file_name: Path of the article file, with either slash style.

    Returns:
        A ``(title, link)`` pair taken from the first matching row, or
        ``("IBHS Website", "https://ibhs.org")`` when no row matches.
    """
    # Normalize separators first; after this no backslash remains, so a single
    # "Articles/" removal covers both "Articles/" and "Articles\" inputs.
    normalized = file_name.replace("\\", "/")
    lookup_name = normalized.replace("Articles/", "")

    matches = df[df["file_name"] == lookup_name]
    if matches.empty:
        # Fallback if not found
        return "IBHS Website", "https://ibhs.org"

    # file_name is assumed unique; if it is not, the first match wins.
    first = matches.iloc[0]
    return first["title"], first["link"]