Spaces:
Running
Running
feat: structured LLM output + clean content for LLM
Browse files
rag_chat_use_case.py:
- New _clean_content_for_llm(): strips image tags, bare URLs, nav bullets,
horizontal rules from Jina markdown before sending to LLM
- Applied in _limit_context() so all sources are cleaned before tokenizing
- New structured prompt format enforces:
## [Headline]
**[Topic]**
- bullet point [N]
> summary callout
instead of verbose paragraph-per-source style
- Both execute_chat and execute_stream use the same new prompt
- Shorter, cleaner prompt reduces token usage and improves response quality
src/core/use_cases/rag_chat_use_case.py
CHANGED
|
@@ -90,6 +90,8 @@ Document:
|
|
| 90 |
|
| 91 |
for doc in docs:
|
| 92 |
content = doc.get("content", "")
|
|
|
|
|
|
|
| 93 |
metadata = doc.get("metadata", {})
|
| 94 |
|
| 95 |
# Extract source name from multiple possible fields
|
|
@@ -746,7 +748,37 @@ JSON:"""
|
|
| 746 |
# ── Step 8: Token limitation ──────────────────────────────────────────
|
| 747 |
return self._limit_context(query, deduped_final)
|
| 748 |
|
| 749 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
past_messages = self.chat_history_db.get_history(session_id, limit=6)
|
| 751 |
return "".join([f"{msg.role}: {msg.content}\n" for msg in past_messages])
|
| 752 |
|
|
@@ -831,46 +863,38 @@ JSON:"""
|
|
| 831 |
source_label = source_name
|
| 832 |
source_index_lines += f"[{idx}] {source_label}\n"
|
| 833 |
|
| 834 |
-
prompt = f"""You are ARKI AI, a real-time news assistant. Today
|
| 835 |
|
| 836 |
-
|
| 837 |
-
SOURCE INDEX — ONLY THESE SOURCES EXIST. DO NOT INVENT ANY OTHERS.
|
| 838 |
-
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 839 |
{source_index_lines if source_index_lines else "NO SOURCES RETRIEVED."}
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
-
|
| 844 |
-
-
|
| 845 |
-
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
4. Always respond in English.
|
| 869 |
-
5. At the END of your answer, on a new line, write exactly:
|
| 870 |
-
FOLLOW_UP: question1 | question2 | question3
|
| 871 |
-
(3 short follow-up questions based only on what you actually found)
|
| 872 |
-
|
| 873 |
-
News Context (from live multilingual database):
|
| 874 |
{context_text if context_text else "NO CONTEXT RETRIEVED."}
|
| 875 |
|
| 876 |
Conversation History:
|
|
@@ -966,46 +990,38 @@ Answer:"""
|
|
| 966 |
source_index_lines += f"[{idx}] {source_label}\n"
|
| 967 |
doc["citation_index"] = idx
|
| 968 |
|
| 969 |
-
prompt_stream = f"""You are ARKI AI, a real-time news assistant. Today
|
| 970 |
|
| 971 |
-
|
| 972 |
-
SOURCE INDEX — ONLY THESE SOURCES EXIST. DO NOT INVENT ANY OTHERS.
|
| 973 |
-
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 974 |
{source_index_lines if source_index_lines else "NO SOURCES RETRIEVED."}
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
-
|
| 979 |
-
-
|
| 980 |
-
-
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
4. Always respond in English.
|
| 1004 |
-
5. At the END of your answer, on a new line, write exactly:
|
| 1005 |
-
FOLLOW_UP: question1 | question2 | question3
|
| 1006 |
-
(3 short follow-up questions based only on what you actually found)
|
| 1007 |
-
|
| 1008 |
-
News Context (from live multilingual database):
|
| 1009 |
{context_text if context_text else "NO CONTEXT RETRIEVED."}
|
| 1010 |
|
| 1011 |
Conversation History:
|
|
|
|
| 90 |
|
| 91 |
for doc in docs:
|
| 92 |
content = doc.get("content", "")
|
| 93 |
+
# Clean Jina markdown artifacts before tokenizing/sending to LLM
|
| 94 |
+
content = self._clean_content_for_llm(content)
|
| 95 |
metadata = doc.get("metadata", {})
|
| 96 |
|
| 97 |
# Extract source name from multiple possible fields
|
|
|
|
| 748 |
# ── Step 8: Token limitation ──────────────────────────────────────────
|
| 749 |
return self._limit_context(query, deduped_final)
|
| 750 |
|
| 751 |
+
def _clean_content_for_llm(self, content: str) -> str:
|
| 752 |
+
"""
|
| 753 |
+
Strip markdown artifacts from Jina-extracted content before sending to LLM.
|
| 754 |
+
Removes: image tags, link URLs (link text is kept), bare URLs, skip-to-content lines, short navigation bullets, dash/equals rule lines.
|
| 755 |
+
Keeps: article text, headings, paragraphs.
|
| 756 |
+
"""
|
| 757 |
+
import re
|
| 758 |
+
# Remove image markdown: ![alt](url)
|
| 759 |
+
content = re.sub(r'!\[.*?\]\(.*?\)', '', content)
|
| 760 |
+
# Remove inline links but keep link text: [text](url) → text
|
| 761 |
+
content = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', content)
|
| 762 |
+
# Remove bare URLs
|
| 763 |
+
content = re.sub(r'https?://\S+', '', content)
|
| 764 |
+
# Remove Skip to content / Skip to main
|
| 765 |
+
content = re.sub(r'\[?Skip to [^\n]+\n?', '', content, flags=re.IGNORECASE)
|
| 766 |
+
# Remove lines that are just navigation items (short lines with *)
|
| 767 |
+
lines = content.split('\n')
|
| 768 |
+
cleaned = []
|
| 769 |
+
for line in lines:
|
| 770 |
+
stripped = line.strip()
|
| 771 |
+
# Skip pure navigation bullets (short, no sentence structure)
|
| 772 |
+
if stripped.startswith('* ') and len(stripped) < 60 and '.' not in stripped:
|
| 773 |
+
continue
|
| 774 |
+
# Skip lines that are just dashes or equals
|
| 775 |
+
if re.match(r'^[-=]{3,}$', stripped):
|
| 776 |
+
continue
|
| 777 |
+
cleaned.append(line)
|
| 778 |
+
content = '\n'.join(cleaned)
|
| 779 |
+
# Collapse multiple blank lines
|
| 780 |
+
content = re.sub(r'\n{3,}', '\n\n', content)
|
| 781 |
+
return content.strip()
|
| 782 |
past_messages = self.chat_history_db.get_history(session_id, limit=6)
|
| 783 |
return "".join([f"{msg.role}: {msg.content}\n" for msg in past_messages])
|
| 784 |
|
|
|
|
| 863 |
source_label = source_name
|
| 864 |
source_index_lines += f"[{idx}] {source_label}\n"
|
| 865 |
|
| 866 |
+
prompt = f"""You are ARKI AI, a real-time Ethiopia & Africa news assistant. Today: {datetime.utcnow().strftime("%B %d, %Y")}.
|
| 867 |
|
| 868 |
+
SOURCE INDEX (cite by number — these are the ONLY sources you may use):
|
|
|
|
|
|
|
| 869 |
{source_index_lines if source_index_lines else "NO SOURCES RETRIEVED."}
|
| 870 |
+
|
| 871 |
+
STRICT RULES:
|
| 872 |
+
- Use ONLY facts from the News Context below. NEVER use training data.
|
| 873 |
+
- Cite every fact: [1], [2], etc. Only use numbers that exist in the Source Index above.
|
| 874 |
+
- Non-English articles: translate to English in your answer.
|
| 875 |
+
- Always respond in English.
|
| 876 |
+
|
| 877 |
+
OUTPUT FORMAT — use this exact structure:
|
| 878 |
+
|
| 879 |
+
## [Short headline summarizing the main news]
|
| 880 |
+
|
| 881 |
+
**[Topic 1]**
|
| 882 |
+
- Key fact from source [N]
|
| 883 |
+
- Key fact from source [N]
|
| 884 |
+
|
| 885 |
+
**[Topic 2]** (if applicable)
|
| 886 |
+
- Key fact from source [N]
|
| 887 |
+
|
| 888 |
+
> 💡 *[One sentence summary of the overall situation]*
|
| 889 |
+
|
| 890 |
+
FOLLOW_UP: question1 | question2 | question3
|
| 891 |
+
|
| 892 |
+
EVALUATION GUIDE:
|
| 893 |
+
- If sources directly answer the question → use the format above
|
| 894 |
+
- If sources are related but not exact → start with "I found related news:" then use the format
|
| 895 |
+
- If no relevant sources → say "I couldn't find relevant news on that topic in today's feed." and STOP
|
| 896 |
+
|
| 897 |
+
News Context:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 898 |
{context_text if context_text else "NO CONTEXT RETRIEVED."}
|
| 899 |
|
| 900 |
Conversation History:
|
|
|
|
| 990 |
source_index_lines += f"[{idx}] {source_label}\n"
|
| 991 |
doc["citation_index"] = idx
|
| 992 |
|
| 993 |
+
prompt_stream = f"""You are ARKI AI, a real-time Ethiopia & Africa news assistant. Today: {datetime.utcnow().strftime("%B %d, %Y")}.
|
| 994 |
|
| 995 |
+
SOURCE INDEX (cite by number — these are the ONLY sources you may use):
|
|
|
|
|
|
|
| 996 |
{source_index_lines if source_index_lines else "NO SOURCES RETRIEVED."}
|
| 997 |
+
|
| 998 |
+
STRICT RULES:
|
| 999 |
+
- Use ONLY facts from the News Context below. NEVER use training data.
|
| 1000 |
+
- Cite every fact: [1], [2], etc. Only use numbers that exist in the Source Index above.
|
| 1001 |
+
- Non-English articles: translate to English in your answer.
|
| 1002 |
+
- Always respond in English.
|
| 1003 |
+
|
| 1004 |
+
OUTPUT FORMAT — use this exact structure:
|
| 1005 |
+
|
| 1006 |
+
## [Short headline summarizing the main news]
|
| 1007 |
+
|
| 1008 |
+
**[Topic 1]**
|
| 1009 |
+
- Key fact from source [N]
|
| 1010 |
+
- Key fact from source [N]
|
| 1011 |
+
|
| 1012 |
+
**[Topic 2]** (if applicable)
|
| 1013 |
+
- Key fact from source [N]
|
| 1014 |
+
|
| 1015 |
+
> 💡 *[One sentence summary of the overall situation]*
|
| 1016 |
+
|
| 1017 |
+
FOLLOW_UP: question1 | question2 | question3
|
| 1018 |
+
|
| 1019 |
+
EVALUATION GUIDE:
|
| 1020 |
+
- If sources directly answer the question → use the format above
|
| 1021 |
+
- If sources are related but not exact → start with "I found related news:" then use the format
|
| 1022 |
+
- If no relevant sources → say "I couldn't find relevant news on that topic in today's feed." and STOP
|
| 1023 |
+
|
| 1024 |
+
News Context:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1025 |
{context_text if context_text else "NO CONTEXT RETRIEVED."}
|
| 1026 |
|
| 1027 |
Conversation History:
|