Spaces:
Sleeping
Sleeping
| from langchain_core.documents import Document | |
| from typing import List | |
| import pandas as pd | |
def format_docs_xml(docs: List[Document]) -> str:
    """
    Format a list of Document objects as an XML-like ``<sources>`` block.

    Each document becomes a ``<source id="N">`` element, where N is its
    zero-based position in ``docs``. The element holds the document's
    ``metadata["source"]`` value and its ``page_content``. Values are
    inserted verbatim; no XML escaping is applied.

    Args:
        docs: Documents to format, in the order they should be numbered.

    Returns:
        Two leading newlines followed by ``<sources>...</sources>``, with one
        ``<source>`` element per document, joined by newlines.
    """
    formatted_docs = []
    for i, doc in enumerate(docs):
        # A document without a "source" entry must not abort formatting of
        # the whole batch; label it "Unknown" instead of raising KeyError.
        metadata_source = doc.metadata.get("source", "Unknown")
        # Render missing content as an empty snippet rather than "None".
        snippet = doc.page_content or ""
        formatted_docs.append(
            f'<source id="{i}">\n'
            f"<source>{metadata_source}</source>\n"
            f"<article_snippet>{snippet}</article_snippet>\n"
            f"</source>"
        )
    # Joined outside the f-string: backslashes are not allowed inside
    # f-string expressions before Python 3.12.
    joined = "\n".join(formatted_docs)
    return f"\n\n<sources>\n{joined}\n</sources>"
| # def format_docs_xml(docs: List[Document]) -> str: | |
| # """ | |
| # Takes a list of Document objects and formats each into XML. | |
| # """ | |
| # formatted_docs = [] | |
| # for i, doc in enumerate(docs): | |
| # metadata_source = doc.metadata.get("source", "Unknown") | |
| # snippet = doc.page_content or "" | |
| # formatted = ( | |
| # f'<source id="{i}">\n' | |
| # f'<source>{metadata_source}</source>\n' | |
| # f'<article_snippet>{snippet}</article_snippet>\n' | |
| # f'</source>' | |
| # ) | |
| # formatted_docs.append(formatted) | |
| # return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>" | |
def get_article_info(df: pd.DataFrame, file_name: str):
    """
    Look up the title and link recorded for an article file.

    ``file_name`` is normalized before the lookup: backslashes become forward
    slashes and every occurrence of ``"Articles/"`` is removed. The result is
    compared for exact equality against the ``file_name`` column of ``df``.

    Args:
        df: Table with ``file_name``, ``title`` and ``link`` columns.
        file_name: Path of the article file, using either slash style.

    Returns:
        A ``(title, link)`` pair taken from the first matching row, or
        ``("IBHS Website", "https://ibhs.org")`` when no row matches.
    """
    # Normalizing the separators first already covers a Windows-style
    # "Articles\" prefix, so one removal of "Articles/" is sufficient.
    normalized_name = file_name.replace("\\", "/").replace("Articles/", "")
    matches = df[df["file_name"] == normalized_name]
    if matches.empty:
        # Fallback if not found
        return "IBHS Website", "https://ibhs.org"
    # file_name is assumed unique in the table; if it is not, the first
    # matching row wins.
    return matches["title"].iloc[0], matches["link"].iloc[0]