Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -47,6 +47,49 @@ prompt = ChatPromptTemplate.from_messages([
|
|
| 47 |
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
|
| 48 |
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def format_latex_for_gradio(text):
|
| 51 |
"""Convert LaTeX delimiters to Gradio-friendly format while preserving markdown"""
|
| 52 |
# Convert display math \[ ... \] to $$ ... $$
|
|
@@ -68,10 +111,15 @@ def chat_function(message, history):
|
|
| 68 |
# Extract sources from retrieved documents
|
| 69 |
source_documents = response.get("context", [])
|
| 70 |
unique_sources = []
|
|
|
|
|
|
|
| 71 |
for doc in source_documents:
|
| 72 |
source = doc.metadata.get('source', None)
|
| 73 |
-
if source and source not in
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# Remove ANY existing "Sources:" section that the LLM generated
|
| 77 |
if "Sources:" in final_answer:
|
|
|
|
| 47 |
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
|
| 48 |
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
|
| 49 |
|
| 50 |
+
def format_citation(filename):
    """Turn a source filename into a human-readable citation string.

    Converts 'Author_Year - Title' (or 'Author (Year) - Title') into
    'Author (Year) – Title'. A trailing '.pdf' or '.txt' extension is removed
    first (case-insensitive). Author names containing 'et al.' or '&' are
    kept as written; underscores inside the author part become spaces.

    Names that match neither pattern get a generic clean-up: underscores
    become spaces, and a hyphen used as a separator (whitespace on at least
    one side) becomes an en dash. Hyphens inside a word, such as 'X-ray',
    are left untouched.

    Args:
        filename: Source name, with or without a .pdf/.txt extension.

    Returns:
        The formatted citation string.

    Examples:
        'Ng et al._2023 - Problems and Solutions' -> 'Ng et al. (2023) – Problems and Solutions'
        'Godley & Xia_2016 - Physics Guide' -> 'Godley & Xia (2016) – Physics Guide'
        'Khan (2003) - Therapy' -> 'Khan (2003) – Therapy'
        'X-ray_imaging_basics.pdf' -> 'X-ray imaging basics'
    """
    # Remove file extension if present
    name = re.sub(r'\.(pdf|txt)$', '', filename, flags=re.IGNORECASE)

    # Structured layouts, tried in this order:
    #   1. "Author_YEAR - Title"
    #   2. "Author (YEAR) - Title" (year already parenthesised)
    structured_patterns = (
        r'^(.+?)_(\d{4})\s*-\s*(.+)$',
        r'^(.+?)\s*\((\d{4})\)\s*-\s*(.+)$',
    )
    for pattern in structured_patterns:
        match = re.match(pattern, name)
        if match:
            author, year, title = match.groups()
            # Underscores inside the author part stand in for spaces
            author = author.replace('_', ' ')
            return f"{author} ({year}) – {title}"

    # Fallback: generic clean-up for names in any other format
    name = name.replace('_', ' ')
    # Only a hyphen with whitespace on at least one side is a separator;
    # a bare hyphen inside a word (e.g. "X-ray") must not be split apart.
    return re.sub(r'\s+-\s*|-\s+', ' – ', name)
|
| 92 |
+
|
| 93 |
def format_latex_for_gradio(text):
|
| 94 |
"""Convert LaTeX delimiters to Gradio-friendly format while preserving markdown"""
|
| 95 |
# Convert display math \[ ... \] to $$ ... $$
|
|
|
|
| 111 |
# Extract sources from retrieved documents
|
| 112 |
source_documents = response.get("context", [])
|
| 113 |
unique_sources = []
|
| 114 |
+
seen_sources = set()
|
| 115 |
+
|
| 116 |
for doc in source_documents:
|
| 117 |
source = doc.metadata.get('source', None)
|
| 118 |
+
if source and source not in seen_sources:
|
| 119 |
+
# Format the citation properly
|
| 120 |
+
formatted_source = format_citation(source)
|
| 121 |
+
unique_sources.append(formatted_source)
|
| 122 |
+
seen_sources.add(source)
|
| 123 |
|
| 124 |
# Remove ANY existing "Sources:" section that the LLM generated
|
| 125 |
if "Sources:" in final_answer:
|