# writer-app / app.py
# Hugging Face Space by NealCaren ("Update app.py", commit 22bc47a).
import gradio as gr
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import anthropic
import json
import re
import os
from pathlib import Path
# Initialize the SentenceTransformer model used for all query/draft embeddings.
# trust_remote_code=True allows the hub repo's custom model code to run —
# presumably required by this nomic-embed checkpoint; confirm before upgrading.
model = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
def load_json_prompts(file_path):
    """Read a UTF-8 JSON file and return the decoded object."""
    path = Path(file_path)
    with path.open(encoding='utf-8') as handle:
        return json.load(handle)
def get_embeddings(chunk):
    """Encode *chunk* with the module-level SentenceTransformer and return the vector."""
    return model.encode(chunk)
def search_similar(query_embedding, chunk_df, k=6, max_per_citation=2):
    """Rank chunks by cosine similarity to the query, capping hits per citation.

    Args:
        query_embedding: 1-D numpy vector for the query text.
        chunk_df: DataFrame with 'embeddings', 'citation', 'text_chunk',
            and 'chunk_label' columns.
        k: maximum number of chunks to return.
        max_per_citation: cap on chunks drawn from any single source.

    Returns:
        List of up to ``k`` dicts with 'citation', 'text', and 'chunk_label',
        ordered by descending similarity.
    """
    # Hoisted out of the loop: the query norm is loop-invariant.
    query_norm = np.linalg.norm(query_embedding)
    similarities = []
    for _, row in chunk_df.iterrows():
        emb = row['embeddings']
        sim = np.dot(query_embedding, emb) / (query_norm * np.linalg.norm(emb))
        similarities.append({
            'citation': row['citation'],
            'text': row['text_chunk'],
            'chunk_label': row['chunk_label'],
            'similarity': sim
        })
    similarities.sort(key=lambda x: x['similarity'], reverse=True)

    # Greedily keep the most similar chunks, at most max_per_citation per source.
    citations_count = {}
    filtered_results = []
    for result in similarities:
        citation = result['citation']
        if citations_count.get(citation, 0) < max_per_citation:
            citations_count[citation] = citations_count.get(citation, 0) + 1
            filtered_results.append({
                'citation': result['citation'],
                'text': result['text'],
                'chunk_label': result['chunk_label']
            })
            if len(filtered_results) == k:
                break
    return filtered_results
def naive_search(thesis, context, client):
    """Draft a first-pass review of *thesis* from *context* via the Anthropic API.

    Relies on the module-level ``naive_system_prompt``; *context* (a list of
    chunk dicts) is interpolated directly into the user message.
    """
    user_text = (
        f"Here are the references you should use:<references>\n{context}\n</references>\n\n"
        f"Here is the topic you need to write about:\n<topic>\n{thesis}\n</topic>\n"
        "Return only the text without preface."
    )
    response = client.messages.create(
        model="claude-3-sonnet-20240229",
        max_tokens=3000,
        temperature=0,
        system=naive_system_prompt,
        messages=[{"role": "user", "content": [{"type": "text", "text": user_text}]}],
    )
    return response.content[0].text
def section_drafter(thesis, context, style_notes, client):
    """Draft one literature-review section via the Anthropic API.

    Args:
        thesis: the section prompt/topic.
        context: list of reference chunk dicts, interpolated into the prompt.
        style_notes: optional style guidance; wrapped in a preamble when non-empty.
        client: an initialized ``anthropic.Anthropic`` client.

    Returns:
        The drafted section text (markdown, starting with a subheader).
    """
    if len(style_notes) > 0:
        # Fixed: doubled colon ("::") in the original preamble.
        style_notes = f'''Here's some **important** style guidelines:\n\n{style_notes}'''
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=5000,
        temperature=0,
        system=section_draft_prompt,
        messages=[{
            "role": "user",
            "content": [{
                "type": "text",
                # Prompt typos fixed: "approriate" -> "appropriate",
                # "enganges" -> "engages".
                "text": f"Write a part of this literature review. {style_notes} Here are the references you should use:<references>\n{context}\n</references>\n\nWe are currently working on this section:\n<prompt>\n{thesis}\n</prompt>\nReturn only the <text> prefaced only with an appropriate markdown subheader for the specific section. The text should be comprehensive and detailed, being sure to cite existing work and the work it engages with."
            }]
        }]
    )
    return message.content[0].text
def format_asa_citation(bibtex_string):
    """Convert a BibTeX entry string to an ASA-style citation string.

    Fields that cannot be found render as empty strings, so incomplete
    entries still yield a citation (with gaps) rather than raising.
    """
    def _field(name):
        # Non-greedy: stop at the first closing brace of the field value.
        match = re.search(rf"{name}\s*=\s*{{(.*?)}}", bibtex_string, re.DOTALL)
        return match.group(1).strip() if match else ""

    author = _field("author")
    title = _field("title")
    year = _field("year")
    journal = _field("journal")
    volume = _field("volume")
    number = _field("number")
    pages = _field("pages")
    return f"{author}. {year}. {title}. {journal} {volume}({number}): {pages}."
def extract_cites(context):
    """Format the unique citations in *context* as a markdown bullet list.

    Deduplicates with ``dict.fromkeys`` to preserve first-seen order —
    the original ``list(set(...))`` made the ordering vary between runs.
    """
    cites = [format_asa_citation(item['citation']) for item in context]
    unique_cites = list(dict.fromkeys(cites))
    return '\n'.join(f'* {cite}' for cite in unique_cites)
def generate_literature_review(thesis, style_notes, api_key, progress=gr.Progress()):
    """Two-pass literature-review generator wired to the Gradio UI.

    Pass 1 retrieves chunks similar to the thesis and drafts a naive review;
    pass 2 retrieves chunks similar to that draft, merges the contexts, and
    produces the final section. Yields markdown for the output pane.

    Args:
        thesis: research question / section topic from the UI.
        style_notes: optional writing-style guidance.
        api_key: Anthropic key, or the literal "quote" to fall back to the
            ANTHROPIC_API_KEY environment variable.
        progress: Gradio progress tracker (injected by Gradio).

    Raises:
        ValueError: if "quote" is used but ANTHROPIC_API_KEY is unset.
    """
    # Clear the output pane before starting a new run.
    yield gr.update(value="")
    output = []

    # "quote" is a shared password that uses the server's own key instead.
    if api_key == "quote":
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            raise ValueError("Environment variable ANTHROPIC_API_KEY not found")

    client = anthropic.Anthropic(api_key=api_key)

    # Chunks were already embedded and loaded at module startup (chunk_df).
    progress(0.1, desc="Loading document chunks...")

    # Pass 1: retrieve chunks similar to the thesis and draft a naive review.
    # (Removed an unused `first_cites = extract_cites(context)` computation.)
    progress(0.2, desc="Finding initial references...")
    context = search_similar(get_embeddings(thesis), chunk_df, k=8)
    progress(0.4, desc="Generating first draft...")
    naive_results = naive_search(thesis, context, client)

    # Pass 2: retrieve chunks similar to the draft itself, then merge,
    # skipping any chunk text already present from pass 1.
    progress(0.6, desc="Finding additional references...")
    context2 = search_similar(get_embeddings(naive_results), chunk_df, k=16)
    text_chunks = [c['text'] for c in context]
    combo_context = context + [c for c in context2 if c['text'] not in text_chunks]

    final_cites = extract_cites(combo_context)
    ref_count = len(final_cites.split('\n'))
    progress(0.8, desc=f"Generating final draft from {ref_count} sources...")
    draft = section_drafter(thesis, combo_context, style_notes, client)

    output.append(draft)
    output.append("\n## Sources\n" + final_cites)
    progress(1.0, desc="Complete!")
    yield "\n\n".join(output)
def create_interface():
    """Build and return the Gradio Blocks UI for the review generator."""
    soft_theme = gr.themes.Soft(
        primary_hue="slate",
    )
    with gr.Blocks(theme=soft_theme) as app:
        gr.Markdown("# CiteCraft")
        gr.Markdown(f"Using {chunk_count} pages from {source_count} sources.")
        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column(scale=1):
                thesis_input = gr.Textbox(
                    label="Section Topic",
                    placeholder="Enter your research question or section theme here",
                    lines=4,
                )
                style_notes_input = gr.Textbox(
                    label="Style notes (Optional)",
                    placeholder="Enter any writing style modifications (optional)",
                    lines=4,
                )
                api_key = gr.Textbox(
                    label="Anthropic API Key",
                    placeholder="Enter your Anthropic API key",
                    type="password",
                )
                generate_button = gr.Button("Generate Review", variant="primary")
            # Right column: rendered markdown output.
            with gr.Column(scale=2):
                output_text = gr.Markdown(label="Generated Review")
        generate_button.click(
            generate_literature_review,
            inputs=[thesis_input, style_notes_input, api_key],
            outputs=output_text,
        )
    return app
if __name__ == "__main__":
    # Startup: load prompt templates, then the pre-embedded document chunks.
    try:
        json_prompts = load_json_prompts('prompts.json')
        print("Successfully loaded prompts.json")
        # System prompts and the name of the chunk file to load.
        section_draft_prompt = json_prompts['section_draft_prompt']
        naive_system_prompt = json_prompts['naive_system_prompt']
        protest_file_name = json_prompts['protest_file_name']
        print(f"Will load document chunks from: {protest_file_name}")
        chunk_df = pd.read_json(protest_file_name)
        chunk_count = len(chunk_df)
        # Distinct sources represented in the chunk table.
        source_count = chunk_df['citation'].nunique()
    except Exception as e:
        # Fixed message: the failure can be prompts.json OR the chunk file,
        # so don't blame prompts.json unconditionally.
        print(f"Error during startup (prompts.json or chunk file): {e}")
        raise
    # Launch the app
    app = create_interface()
    app.launch(share=True)  # share=True creates a public URL