DavidFernandes committed on
Commit
f17a4c9
·
verified ·
1 Parent(s): 5500bfb

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +1 -389
utils.py CHANGED
@@ -1,4 +1,3 @@
1
- <<<<<<< HEAD
2
  import streamlit as st
3
  from groq import Groq
4
  import io
@@ -373,391 +372,4 @@ def delete_from_history(doc_name):
373
 
374
  def format_timestamp(timestamp):
375
  """Format timestamp for display"""
376
- return timestamp.strftime("%Y-%m-%d %H:%M:%S")
377
- =======
378
- # utils.py
379
- import streamlit as st
380
- from groq import Groq
381
- import io
382
- import base64
383
- import re
384
- import os
385
- from dotenv import load_dotenv
386
- from llama_index.core import VectorStoreIndex, Settings, Document
387
- from llama_index.readers.file import PDFReader
388
- from llama_index.llms.groq import Groq as LlamaGroq
389
- from llama_index.embeddings.langchain import LangchainEmbedding
390
- from langchain_community.embeddings import HuggingFaceEmbeddings
391
- from datetime import datetime
392
- from PIL import Image
393
-
394
- # Load environment variables and configure
395
- load_dotenv()
396
- groq_api_key = os.getenv("GROQ_API_KEY")
397
- client = Groq(api_key=groq_api_key)
398
-
399
- # Configure LlamaIndex
400
- Settings.llm = LlamaGroq(api_key=groq_api_key, model="llama-3.1-70b-versatile")
401
- lc_embed_model = HuggingFaceEmbeddings(
402
- model_name="sentence-transformers/all-mpnet-base-v2"
403
- )
404
- Settings.embed_model = LangchainEmbedding(lc_embed_model)
405
-
406
- def initialize_session_state():
407
- """Initialize all session state variables"""
408
- if 'chat_engines' not in st.session_state:
409
- st.session_state.chat_engines = {}
410
- if 'analyses' not in st.session_state:
411
- st.session_state.analyses = {}
412
- if 'documents' not in st.session_state:
413
- st.session_state.documents = {}
414
- if 'current_doc' not in st.session_state:
415
- st.session_state.current_doc = None
416
- if 'messages' not in st.session_state:
417
- st.session_state.messages = []
418
- if 'document_history' not in st.session_state:
419
- st.session_state.document_history = {}
420
-
421
- def encode_image_to_base64(image):
422
- """Convert PIL Image to base64 string"""
423
- buffered = io.BytesIO()
424
- image.save(buffered, format="JPEG")
425
- return base64.b64encode(buffered.getvalue()).decode()
426
-
427
- def process_image(image):
428
- """Process image using Llama vision model"""
429
- img_base64 = encode_image_to_base64(image)
430
- img_url = f"data:image/jpeg;base64,{img_base64}"
431
-
432
- completion = client.chat.completions.create(
433
- model="llama-3.2-11b-vision-preview",
434
- messages=[
435
- {
436
- "role": "user",
437
- "content": [
438
- {
439
- "type": "text",
440
- "text": """Please analyze this government document and provide:
441
- 1. Document type and purpose
442
- 2. Key requirements and deadlines
443
- 3. Complex terms explained simply
444
- 4. Required actions or next steps
445
- 5. Important contact information or submission details"""
446
- },
447
- {
448
- "type": "image_url",
449
- "image_url": {
450
- "url": img_url
451
- }
452
- }
453
- ]
454
- }
455
- ],
456
- temperature=0.1,
457
- max_tokens=1024,
458
- top_p=1,
459
- stream=False
460
- )
461
-
462
- return completion.choices[0].message.content
463
-
464
- def generate_pdf_analysis(documents):
465
- """Generate analysis from PDF documents using Groq"""
466
- try:
467
- # Combine all document content
468
- full_text = "\n".join([doc.text for doc in documents])
469
-
470
- # Generate analysis using Groq
471
- completion = client.chat.completions.create(
472
- model="llama-3.1-70b-versatile",
473
- messages=[
474
- {
475
- "role": "user",
476
- "content": (
477
- "Please analyze this government document and provide:\n"
478
- "1. Document Type and Purpose:\n"
479
- " - What kind of document is this?\n"
480
- " - What is its main purpose?\n\n"
481
- "2. Key Requirements:\n"
482
- " - What are the main requirements or conditions?\n"
483
- " - What documents or information are needed?\n\n"
484
- "3. Important Deadlines:\n"
485
- " - What are the key dates and deadlines?\n"
486
- " - Are there any time-sensitive requirements?\n\n"
487
- "4. Complex Terms Explained:\n"
488
- " - Explain any technical or legal terms in simple language\n"
489
- " - Clarify any complex procedures\n\n"
490
- "5. Required Actions:\n"
491
- " - What steps need to be taken?\n"
492
- " - What is the process to follow?\n\n"
493
- "6. Contact Information:\n"
494
- " - Who to contact for queries?\n"
495
- " - Where to submit the documents?\n\n"
496
- "Document content:\n" + full_text
497
- )
498
- }
499
- ],
500
- temperature=0.1,
501
- max_tokens=2048,
502
- top_p=1
503
- )
504
-
505
- # Format the analysis with proper styling
506
- analysis = completion.choices[0].message.content
507
-
508
- completionsum = client.chat.completions.create(
509
- model="llama-3.1-8b-instant",
510
- messages=[
511
- {
512
- "role": "user",
513
- "content": (
514
- "Summarize the following content: " + analysis
515
- )
516
- }
517
- ],
518
- temperature=0.1,
519
- max_tokens=2048,
520
- top_p=1
521
- )
522
-
523
- analysis = completionsum.choices[0].message.content
524
-
525
- # Add formatting for better readability
526
- formatted_analysis = (
527
- "<div class='analysis-container'>"
528
- "<div class='analysis-section'>" +
529
- analysis.replace('\n\n', '</div><div class="analysis-section">') +
530
- "</div>"
531
- "</div>"
532
- )
533
-
534
- return formatted_analysis
535
-
536
- except Exception as e:
537
- error_msg = "Error generating PDF analysis: " + str(e)
538
- raise Exception(error_msg)
539
-
540
- def clean_llm_output(output):
541
- """Clean LLM output by removing HTML tags and formatting symbols"""
542
- # Remove HTML tags
543
- cleaned_text = re.sub(r'<[^>]+>', '', output)
544
- # Remove double asterisks
545
- cleaned_text = cleaned_text.replace('**', '')
546
- cleaned_text = cleaned_text.replace('*', '')
547
- # Remove extra whitespace
548
- cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
549
- return cleaned_text.strip()
550
-
551
- def format_analysis_results(text):
552
- """Format analysis results into structured HTML"""
553
- # First clean the text
554
- cleaned_text = clean_llm_output(text)
555
-
556
- # Split into sections
557
- sections = []
558
- current_section = ""
559
- current_title = ""
560
-
561
- for line in cleaned_text.split('\n'):
562
- line = line.strip()
563
- if ':' in line and not line.startswith('*'):
564
- # If we have a previous section, save it
565
- if current_title:
566
- sections.append((current_title, current_section.strip()))
567
- # Start new section
568
- parts = line.split(':', 1)
569
- current_title = parts[0].strip()
570
- current_section = parts[1].strip() if len(parts) > 1 else ""
571
- else:
572
- current_section += " " + line
573
-
574
- # Add the last section
575
- if current_title:
576
- sections.append((current_title, current_section.strip()))
577
-
578
- # Generate HTML
579
- html = "<div class='analysis-results'>"
580
- for title, content in sections:
581
- html += f"""
582
- <div class='analysis-section card' style='margin-bottom: 1rem;'>
583
- <h4 style='color: #60A5FA; margin-bottom: 0.5rem;'>{title}</h4>
584
- <p style='margin: 0;'>{content}</p>
585
- </div>
586
- """
587
- html += "</div>"
588
-
589
- return html
590
-
591
- def process_captured_image(picture):
592
- """Process image captured from camera with mobile-friendly UI"""
593
- try:
594
- # Show processing status
595
- status_placeholder = st.empty()
596
- status_placeholder.markdown(
597
- "<div class='status-badge status-warning'>"
598
- "📸 Processing captured image..."
599
- "</div>",
600
- unsafe_allow_html=True
601
- )
602
-
603
- # Process the image
604
- image = Image.open(picture)
605
-
606
- # Display the captured image with proper mobile sizing
607
- st.image(
608
- image,
609
- caption="Captured Document",
610
- use_column_width=True # Makes image responsive
611
- )
612
-
613
- # Process image with AI
614
- with st.spinner("Analyzing document..."):
615
- analysis = process_image(image)
616
-
617
- # Generate filename with timestamp
618
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
619
- filename = f"captured_image_{timestamp}"
620
-
621
- # Save results
622
- st.session_state.analyses[filename] = {
623
- 'type': 'image/jpeg',
624
- 'analysis': analysis,
625
- 'timestamp': datetime.datetime.now()
626
- }
627
-
628
- # Create chat engine
629
- st.session_state.chat_engines[filename] = create_chat_engine(analysis)
630
-
631
- # Save to history
632
- save_to_history(
633
- filename,
634
- 'Captured Image',
635
- analysis,
636
- datetime.datetime.now()
637
- )
638
-
639
- # Update status to success
640
- status_placeholder.markdown(
641
- "<div class='status-badge status-success'>"
642
- "✅ Image analyzed successfully!"
643
- "</div>",
644
- unsafe_allow_html=True
645
- )
646
-
647
- # Display analysis results
648
- st.markdown(
649
- "<div class='card'>"
650
- "<h4>Analysis Results</h4>"
651
- f"<div style='margin: 1rem 0;'>{analysis}</div>"
652
- "</div>",
653
- unsafe_allow_html=True
654
- )
655
-
656
- # Mobile-friendly action buttons
657
- st.markdown("<div class='touch-spacing'>", unsafe_allow_html=True)
658
-
659
- col1, col2 = st.columns(2)
660
- with col1:
661
- if st.button("💬 Start Chat", use_container_width=True):
662
- st.session_state.current_doc = filename
663
- st.switch_page("pages/Document_Chat.py")
664
- with col2:
665
- if st.button("📸 New Capture", use_container_width=True):
666
- st.rerun()
667
-
668
- st.markdown("</div>", unsafe_allow_html=True)
669
-
670
- except Exception as e:
671
- st.error(
672
- "❌ Error processing image\n"
673
- f"Details: {str(e)}"
674
- )
675
-
676
- def process_pdf(pdf_file):
677
- """Process PDF document using LlamaIndex"""
678
- temp_dir = "temp_docs"
679
- os.makedirs(temp_dir, exist_ok=True)
680
- temp_path = os.path.join(temp_dir, "temp.pdf")
681
-
682
- with open(temp_path, "wb") as f:
683
- f.write(pdf_file.getvalue())
684
-
685
- try:
686
- reader = PDFReader()
687
- documents = reader.load_data(temp_path)
688
- return documents
689
- finally:
690
- if os.path.exists(temp_path):
691
- os.remove(temp_path)
692
- if os.path.exists(temp_dir) and not os.listdir(temp_dir):
693
- os.rmdir(temp_dir)
694
-
695
- def create_chat_engine(content):
696
- """Create chat engine from document content"""
697
- if isinstance(content, str):
698
- documents = [Document(text=content)]
699
- else:
700
- documents = content
701
-
702
- index = VectorStoreIndex.from_documents(documents)
703
- return index.as_chat_engine(chat_mode="condense_question", verbose=True)
704
-
705
- def generate_document(doc_type, fields):
706
- """Generate government documents based on type and fields"""
707
- prompt = f"""Generate a formal {doc_type} with the following details:
708
-
709
- {fields}
710
-
711
- Please format this as a proper official document following standard government formatting."""
712
-
713
- completion = client.chat.completions.create(
714
- model="llama-3.1-70b-versatile",
715
- messages=[
716
- {
717
- "role": "user",
718
- "content": prompt
719
- }
720
- ],
721
- temperature=0.7,
722
- max_tokens=2048,
723
- top_p=1
724
- )
725
-
726
- return completion.choices[0].message.content
727
-
728
- def save_to_history(doc_name, doc_type, content, timestamp=None):
729
- """Save document to history with metadata"""
730
- if timestamp is None:
731
- timestamp = datetime.now()
732
-
733
- st.session_state.document_history[doc_name] = {
734
- 'type': doc_type,
735
- 'content': content,
736
- 'timestamp': timestamp,
737
- 'status': 'Processed'
738
- }
739
-
740
- def get_document_history():
741
- """Retrieve document history sorted by timestamp"""
742
- history = st.session_state.document_history
743
- return dict(sorted(
744
- history.items(),
745
- key=lambda x: x[1]['timestamp'],
746
- reverse=True
747
- ))
748
-
749
- def delete_from_history(doc_name):
750
- """Delete document from history"""
751
- if doc_name in st.session_state.document_history:
752
- del st.session_state.document_history[doc_name]
753
- if doc_name in st.session_state.chat_engines:
754
- del st.session_state.chat_engines[doc_name]
755
- if doc_name in st.session_state.analyses:
756
- del st.session_state.analyses[doc_name]
757
- if st.session_state.current_doc == doc_name:
758
- st.session_state.current_doc = None
759
-
760
- def format_timestamp(timestamp):
761
- """Format timestamp for display"""
762
- return timestamp.strftime("%Y-%m-%d %H:%M:%S")
763
- >>>>>>> 1bc20a0d3edc7f88f03e506f84b01a7303d403b2
 
 
1
  import streamlit as st
2
  from groq import Groq
3
  import io
 
372
 
373
  def format_timestamp(timestamp):
374
  """Format timestamp for display"""
375
+ return timestamp.strftime("%Y-%m-%d %H:%M:%S")