RoofingRoadmap / helpers.py
IBHS's picture
Upload 13 files
fa8ee23 verified
raw
history blame
1.58 kB
from langchain_core.documents import Document
from typing import List
import pandas as pd
def format_docs_xml(docs: List[Document]) -> str:
    """
    Render a list of documents as an XML-style ``<sources>`` block.

    Each document becomes a ``<source id="i">`` element, where ``i`` is the
    document's position in ``docs``. The element holds the document's
    ``metadata["source"]`` value and its ``page_content``. Both values are
    inserted verbatim; no XML escaping is applied.

    Args:
        docs: Documents to format. Each must expose ``metadata`` (a mapping)
            and ``page_content``.

    Returns:
        The formatted block, preceded by two newlines. An empty ``docs``
        list yields a ``<sources>`` element with no entries.
    """
    formatted_docs = []
    for i, doc in enumerate(docs):
        # A document without a "source" entry is labelled "Unknown" rather
        # than aborting the whole batch with a KeyError.
        metadata_source = doc.metadata.get("source", "Unknown")
        # Missing content renders as an empty snippet, not the text "None".
        snippet = doc.page_content or ""
        formatted_docs.append(
            f'<source id="{i}">\n'
            f"<source>{metadata_source}</source>\n"
            f"<article_snippet>{snippet}</article_snippet>\n"
            f"</source>"
        )
    joined = "\n".join(formatted_docs)
    return f"\n\n<sources>\n{joined}\n</sources>"
# def format_docs_xml(docs: List[Document]) -> str:
# """
# Takes a list of Document objects and formats each into XML.
# """
# formatted_docs = []
# for i, doc in enumerate(docs):
# metadata_source = doc.metadata.get("source", "Unknown")
# snippet = doc.page_content or ""
# formatted = (
# f'<source id="{i}">\n'
# f'<source>{metadata_source}</source>\n'
# f'<article_snippet>{snippet}</article_snippet>\n'
# f'</source>'
# )
# formatted_docs.append(formatted)
# return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
def get_article_info(df: pd.DataFrame, file_name: str):
    """
    Look up the title and link recorded for an article file.

    The path is normalised before matching: backslashes are converted to
    forward slashes and every occurrence of "Articles/" is removed. The
    result is compared for exact equality with the ``file_name`` column.

    Args:
        df: DataFrame with ``file_name``, ``title`` and ``link`` columns.
        file_name: Path of the article file, using either separator style.

    Returns:
        A ``(title, link)`` pair taken from the first matching row, or
        ``("IBHS Website", "https://ibhs.org")`` when no row matches.
    """
    # Backslashes are normalised first, so one "Articles/" removal also
    # handles a Windows-style "Articles\" prefix; a separate replacement for
    # the backslash form could never match.
    normalized_name = file_name.replace("\\", "/").replace("Articles/", "")
    matches = df[df["file_name"] == normalized_name]
    if matches.empty:
        # Fallback if not found
        return "IBHS Website", "https://ibhs.org"
    return matches["title"].iloc[0], matches["link"].iloc[0]