import gradio as gr
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import random
# Use lighter BLIP model instead of heavy LLaVA
# The processor and model are created at module import time, so importing this
# file triggers the load of the "Salesforce/blip-image-captioning-large"
# checkpoint (the print below is the only progress indication).
print("Loading BLIP model (lighter version)...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# Weights are loaded in float16 only when CUDA is available, float32 otherwise.
# NOTE(review): the model is not moved to a device here; callers must do that.
model = BlipForConditionalGeneration.from_pretrained(
"Salesforce/blip-image-captioning-large",
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Universal Video Prompting Guide combining Gen-4 + SARA.
# Markdown text displayed to the user. The emoji, check marks and arrows were
# stored as mis-decoded UTF-8 (e.g. a Greek beta where a check mark belongs);
# they are restored here so the guide renders as intended.
# NOTE(review): the emoji in front of "Gen-4 Official Method" could not be
# recovered unambiguously from the garbled bytes; a rocket is assumed.
unified_instructions = """
# 🎬 Universal Video Prompting Guide
*Compatible with Gen-4, Sora, Pika, Luma, Runway and all diffusion-based video models*
## Core Principles (Universal)
✅ **Focus on MOTION, not static description**
✅ **Use positive phrasing exclusively**
✅ **Start simple, iterate progressively**
✅ **Refer to subjects in general terms** ("the subject," "the woman")
✅ **Keep prompts direct and easily understood**
## Two Complementary Approaches
### 🚀 **Gen-4 Official Method** (Recommended for beginners)
**Structure**: Simple iterative building
1. Start with essential motion only
2. Add one element at a time: Subject Motion → Camera Motion → Scene Motion → Style Descriptors
3. Use general terms and avoid complex descriptions
**Example**:
- Basic: "The subject walks forward"
- + Camera: "The subject walks forward. Handheld camera follows"
- + Scene: "The subject walks forward. Handheld camera follows. Dust trails behind"
- + Style: "The subject walks forward. Handheld camera follows. Dust trails behind. Cinematic."
### 🎯 **SARA Framework** (Advanced precision)
**Structure**: [Subject] + [Action] + [Reference] + [Atmosphere]
- **Subject (S)**: Main element to control
- **Action (A)**: Movement/transformation ([verb] + [adverb])
- **Reference (R)**: Spatial anchors ("while X remains steady")
- **Atmosphere (A)**: Context and style
**Template**: [Subject] [verb] [adverb] while [reference] [atmosphere]
**Example**: "The subject walks smoothly while background remains steady, cinematic atmosphere"
## Essential Vocabulary
### Effective Verbs (Action)
- **Movement**: walks, runs, moves, glides, flows, drifts
- **Rotation**: turns, spins, rotates, pivots, tilts
- **Transformation**: transforms, morphs, transitions, evolves
- **Expression**: speaks, gestures, looks, smiles, nods
### Effective Adverbs (Quality)
- **Speed**: slowly, quickly, gradually, suddenly, steadily
- **Style**: smoothly, naturally, elegantly, gracefully, dramatically
- **Intensity**: gently, softly, powerfully, intensely, subtly
### Camera Motion Terms
- **Basic**: locked camera, handheld, steady cam
- **Movement**: pan left/right, tilt up/down, zoom in/out, dolly forward/back
- **Advanced**: tracking shot, crane movement, orbital movement
### Style Descriptors
- **Aesthetic**: cinematic, live-action, smooth animation, stop motion
- **Mood**: dramatic, peaceful, energetic, mysterious, professional
- **Technical**: 4K, slow motion, time-lapse, documentary style
## Multi-Subject Guidelines
- **Positional**: "The subject on the left walks. The subject on the right remains still."
- **Descriptive**: "The woman nods. The man waves."
- **Sequential**: "The woman speaks then the man responds."
## Scene Motion Approaches
- **Insinuated**: "The subject runs across the dusty desert" (natural)
- **Explicit**: "The subject runs across the desert. Dust trails behind them" (emphasized)
## Proven Examples (from SARA Framework)
### Character Motion
- "The woman speaks enthusiastically to camera while camera remains still, online tutorial"
- "The subject transitions from walking to jumping while background stays constant"
### Camera Motion
- "The subject remains centered as camera smoothly moves left with steady background"
- "Handheld camera tracks the subject as they walk forward naturally"
### Environmental
- "Camera stays fixed while day cycles into night over the temple, stone structures remain still"
- "The red cup slides smoothly to the right on white table, maintaining background constant"
### Complex Scenes
- "The pile of rocks transforms into a humanoid made of rugged volcanic rocks. The rock humanoid walks around"
- "The woman inspects her reflection in mirror. Surface bubbles with translucent bubbles. Locked camera"
## Technical Notes
- **Gen-4/Runway**: Prefer SARA structure for precision
- **Sora/OpenAI**: Works well with both approaches
- **Pika/Stable**: Gen-4 method often more effective
- **All models**: Start simple, iterate based on results
"""
# SARA-style prompt templates, grouped by the kind of motion they describe.
# Each entry is a str.format-style template with named placeholders.
SARA_TEMPLATES = dict(
    character_motion=[
        "{subject} speaks {adverb} to camera while camera remains still, {genre}",
        "{subject} {action} {adverb} while background stays constant, {style}",
        "{subject} transitions from {action1} to {action2} while frame remains fixed, {genre}",
    ],
    camera_motion=[
        "{subject} remains centered as camera {movement} {adverb} with steady background",
        "{camera_type} camera {action} the {subject} as they {movement} {adverb}",
        "Camera {movement} {adverb} while {subject} maintains position, {style}",
    ],
    environmental=[
        "Camera stays fixed while {environment} {transformation} over {subject}, {reference} remain still",
        "{subject} {action} while {environmental_effect} around them, {style}",
        "{environmental_element} {movement} {adverb} as {subject} {action}, maintaining {reference}",
    ],
    transformations=[
        "{object} transforms into {new_form} made of {material}. The {new_subject} {action} around",
        "{subject} {action} in {location}. {environmental_reaction} {adverb}. {camera_style}",
        "The {subject} {action} while {environmental_change} occurs {adverb}, {atmosphere}",
    ],
)
# Gen-4 style templates, ordered from the simplest prompt ("basic") to the
# most detailed one ("complete"); each level adds one more kind of element.
GEN4_TEMPLATES = dict(
    basic=[
        "The subject {action}",
        "The {subject} {movement} {direction}",
        "{subject} {expression} to camera",
    ],
    with_camera=[
        "The subject {action}. {camera_movement}",
        "{subject} {movement} {direction}. Camera {camera_action}",
        "Handheld camera {camera_behavior} as {subject} {action}",
    ],
    with_scene=[
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}",
        "{subject} {movement} across {environment}. {environmental_reaction}",
        "Camera {camera_movement} while {subject} {action}, {scene_description}",
    ],
    complete=[
        "The subject {action}. {camera_movement}. {scene_element} {scene_action}. {style}",
        "{subject} {movement} {adverb} across {environment}. {camera_type} camera {camera_action}. {style}",
        "Camera {camera_movement} as {subject} {action}, {environmental_reaction}, {atmosphere}",
    ],
)
# Word lists available for filling the template placeholders above.
VOCABULARY = dict(
    subjects=["the subject", "the woman", "the man", "the person", "the character"],
    actions=["walks", "runs", "moves", "glides", "flows", "turns", "speaks", "gestures"],
    adverbs=["smoothly", "slowly", "quickly", "naturally", "gracefully", "steadily", "gently"],
    camera_movements=["locked camera", "handheld", "dolly forward", "pan left", "pan right", "tracking shot"],
    environments=["dusty desert", "forest", "urban street", "open field", "indoor space"],
    styles=["cinematic", "documentary", "live-action", "dramatic", "peaceful", "energetic"],
)
def analyze_image_simple(image):
    """Caption an image with BLIP and build a Markdown analysis report.

    Args:
        image: A PIL image, or an array accepted by ``Image.fromarray``.
            ``None`` means nothing was uploaded.

    Returns:
        A 3-tuple ``(analysis, basic_caption, scene_info)``:
        the Markdown report, the raw BLIP caption, and a dict of scene
        attributes for later prompt generation. When ``image`` is ``None`` or
        any step fails, a message is returned in the first slot together with
        ``""`` and ``{}``.
    """
    if image is None:
        return "Please upload an image first.", "", {}
    try:
        # Convert to PIL if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Basic geometry drives the composition label.
        width, height = image.size
        aspect_ratio = width / height
        if aspect_ratio > 1.5:
            composition = "Wide landscape shot"
        elif aspect_ratio < 0.7:
            composition = "Vertical portrait shot"
        else:
            composition = "Balanced composition"

        # Generate caption with BLIP. The model is loaded in float16 when CUDA
        # is available, so the inputs must be moved to the same device AND
        # cast to the same dtype; float32 pixel values fed to a half-precision
        # model raise a dtype-mismatch error inside generate().
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)  # no-op once the model already lives on the device
        inputs = processor(image, return_tensors="pt").to(device, model.dtype)
        with torch.no_grad():  # inference only; skip autograd bookkeeping
            out = model.generate(**inputs, max_length=50, num_beams=3)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Keyword-based enrichment of the caption.
        enhanced_analysis = analyze_scene_with_ai(basic_caption, aspect_ratio, composition)

        # Assemble the report line by line. The emoji and bullets replace
        # mis-decoded UTF-8 glyphs that previously leaked into the UI.
        # NOTE(review): the emoji for the "Image Analysis" heading could not
        # be recovered unambiguously; a bar chart is assumed.
        insight_lines = "\n".join(
            f"• {insight}" for insight in enhanced_analysis['motion_insights']
        )
        analysis = "\n".join([
            "📊 **Image Analysis:**",
            f"• **Dimensions**: {width} x {height}",
            f"• **Composition**: {composition}",
            f"• **Aspect Ratio**: {aspect_ratio:.2f}",
            "🎨 **Basic Description**:",
            f'"{basic_caption}"',
            "🧠 **AI-Enhanced Analysis**:",
            f"{enhanced_analysis['scene_interpretation']}",
            "💡 **Motion & Cinematography Insights**:",
            insight_lines,
            "🎯 **Recommended Approach**:",
            f"{enhanced_analysis['recommended_approach']}",
        ])

        # Enhanced scene info for prompt generation
        scene_info = {
            'basic_description': basic_caption,
            'enhanced_description': enhanced_analysis['detailed_description'],
            'composition': composition,
            'aspect_ratio': aspect_ratio,
            'has_person': enhanced_analysis['has_person'],
            'emotional_tone': enhanced_analysis['emotional_tone'],
            'visual_style': enhanced_analysis['visual_style'],
            'setting': enhanced_analysis['setting'],
            'distinctive_elements': enhanced_analysis['distinctive_elements'],
            'motion_potential': enhanced_analysis['motion_potential'],
            'cinematic_qualities': enhanced_analysis['cinematic_qualities'],
        }
        return analysis, basic_caption, scene_info
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error analyzing image: {str(e)}", "", {}
def analyze_scene_with_ai(basic_caption, aspect_ratio, composition):
    """Run every caption heuristic and bundle the results into one dict.

    The caption is lower-cased (non-strings become ``""``) and passed through
    the scene, tone, style, distinctive-element, motion and cinematic helpers.
    The returned dict carries their outputs plus a one-sentence
    ``scene_interpretation`` summary.
    """
    caption = basic_caption.lower() if isinstance(basic_caption, str) else ""

    # Scene facts first: most of the later helpers take them as input.
    scene = extract_scene_elements(caption)
    tone = determine_emotional_tone(caption, scene)
    style = determine_visual_style(caption, scene, composition)
    highlights = identify_distinctive_elements(caption)
    motion = assess_motion_potential(caption, scene)
    cinematic = analyze_cinematic_potential(caption, composition, aspect_ratio)

    # The description helper receives the caption exactly as it was given.
    description = create_enhanced_description(basic_caption, scene, tone)
    insights = generate_motion_insights(scene, tone, style, composition)
    approach = recommend_approach(scene, tone, style)

    interpretation = (
        f"Scene shows {scene['subject']} in {scene['setting']} with {tone} mood. "
        f"Key elements: {', '.join(highlights)}"
    )

    result = {}
    result['detailed_description'] = description
    result['scene_interpretation'] = interpretation
    result['motion_insights'] = insights
    result['recommended_approach'] = approach
    result['has_person'] = scene['has_person']
    result['emotional_tone'] = tone
    result['visual_style'] = style
    result['setting'] = scene['setting']
    result['distinctive_elements'] = highlights
    result['motion_potential'] = motion
    result['cinematic_qualities'] = cinematic
    return result
def extract_scene_elements(text):
    """Pull subject, setting, clothing, colours and objects out of a caption.

    Subjects, clothing, colours and objects are matched against whole words.
    Plain substring tests classified every "woman" as "man" (``'man' in
    'woman'``) and found a "hat" in "that" or "red" in "bored".

    Args:
        text: Caption text. Non-string input is treated as an empty caption.

    Returns:
        dict with keys ``subject``, ``setting``, ``clothing``, ``colors``,
        ``objects`` and ``has_person``.
    """
    import re  # function-scope import: ``re`` is not imported at file level

    text = text.lower() if isinstance(text, str) else ""
    words = set(re.findall(r"[a-z]+", text))
    # Fold simple plurals ("hats" -> "hat") so they still match the keywords.
    words |= {word[:-1] for word in words if word.endswith("s")}

    elements = {
        'subject': 'subject',
        'setting': 'neutral',
        'clothing': None,
        'colors': [],
        'objects': [],
        'has_person': False
    }

    # Detect subjects with context
    if words & {'man', 'male', 'gentleman'}:
        elements['subject'] = 'man'
        elements['has_person'] = True
        # Detect what the man is wearing
        if 'costume' in words:
            elements['subject'] = 'man in costume'
            elements['clothing'] = 'costume'
        elif 'suit' in words:
            elements['subject'] = 'man in suit'
            elements['clothing'] = 'suit'
    elif words & {'woman', 'female', 'lady'}:
        elements['subject'] = 'woman'
        elements['has_person'] = True
        if 'dress' in words:
            elements['subject'] = 'woman in dress'
            elements['clothing'] = 'dress'

    # Settings deliberately stay substring-based so that compound words such
    # as "outdoors", "bedroom" or "classroom" are still recognised.
    if any(key in text for key in ('outdoor', 'outside', 'street', 'nature', 'park')):
        elements['setting'] = 'outdoor'
    elif any(key in text for key in ('indoor', 'inside', 'room', 'office', 'studio')):
        elements['setting'] = 'indoor'
    elif any(key in text for key in ('stage', 'performance')):
        elements['setting'] = 'performance'

    # Colours and objects keep the order of the reference lists.
    color_words = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown',
                   'pink', 'purple', 'orange', 'gold', 'silver']
    elements['colors'] = [color for color in color_words if color in words]

    objects = ['hat', 'cape', 'flag', 'chair', 'table', 'background', 'wall']
    elements['objects'] = [obj for obj in objects if obj in words]
    return elements
def determine_emotional_tone(text, scene_elements):
    """Pick a tone label for the scene from caption keywords.

    The keyword groups are tried in priority order and the first group with a
    substring hit decides the tone. With no hit, the tone falls back to the
    scene's setting, then its clothing, and finally ``'neutral'``.
    """
    caption = text.lower() if isinstance(text, str) else ""

    # (tone, trigger substrings), highest priority first.
    keyword_table = (
        ('serious', ('serious', 'formal', 'stern', 'professional')),
        ('cheerful', ('happy', 'smiling', 'cheerful', 'joyful')),
        ('dramatic', ('dramatic', 'intense', 'powerful', 'bold')),
        ('elegant', ('elegant', 'graceful', 'refined')),
        ('theatrical', ('costume', 'performance')),
    )
    for tone, triggers in keyword_table:
        for trigger in triggers:
            if trigger in caption:
                return tone

    # No keyword matched: infer the tone from scene context instead.
    if scene_elements['setting'] == 'performance':
        return 'theatrical'
    if scene_elements['clothing'] in ('suit', 'formal'):
        return 'professional'
    return 'neutral'
def determine_visual_style(text, scene_elements, composition):
    """Choose a visual style label from the caption, scene and composition.

    Checks run in priority order: theatrical, professional, wide-shot
    cinematic, colour-driven dramatic, and finally the cinematic default.
    """
    caption = text.lower() if isinstance(text, str) else ""
    setting = scene_elements['setting']

    if setting == 'performance' or 'costume' in caption:
        return 'theatrical'
    if setting == 'indoor' and 'formal' in caption:
        return 'professional'
    if composition in ['Wide landscape shot']:
        return 'cinematic'

    # Strong accent colours push the look towards "dramatic".
    accents = ['red', 'gold', 'dramatic']
    for accent in accents:
        if accent in scene_elements['colors']:
            return 'dramatic'
    return 'cinematic'
def identify_distinctive_elements(text):
    """List notable caption details that can enrich a video prompt.

    Keywords are matched against whole words (simple plurals included).
    Plain substring tests reported a "distinctive hat" for captions that
    contain "that" and "red coloring" for words such as "covered".

    Args:
        text: Caption text. Non-string input is treated as an empty caption.

    Returns:
        List of element labels, or ``['natural elements']`` when nothing
        distinctive is found.
    """
    import re  # function-scope import: ``re`` is not imported at file level

    caption = text.lower() if isinstance(text, str) else ""
    words = set(re.findall(r"[a-z]+", caption))
    # Fold simple plurals ("flags" -> "flag") so they still match.
    words |= {word[:-1] for word in words if word.endswith("s")}

    # Clothing and costume elements, in reporting order.
    labelled = (
        ('costume', 'elaborate costume'),
        ('cape', 'flowing cape'),
        ('hat', 'distinctive hat'),
        ('flag', 'flag detail'),
    )
    elements = [label for keyword, label in labelled if keyword in words]

    # Color elements
    found_colors = [color for color in ('red', 'blue', 'green', 'gold') if color in words]
    if found_colors:
        elements.append(f"{', '.join(found_colors)} coloring")

    # Setting elements
    if 'background' in words:
        elements.append('detailed background')

    return elements if elements else ['natural elements']
def assess_motion_potential(text, scene_elements):
    """List the kinds of motion that suit the detected scene.

    People contribute expression and gesture motion plus a clothing-specific
    entry; the setting contributes environment or lighting motion. ``text``
    is accepted for signature compatibility but does not affect the result.
    """
    motions = []

    if scene_elements['has_person']:
        motions += ['facial expressions', 'hand gestures', 'body movement']
        # At most one clothing entry applies, since clothing is a single value.
        clothing_motion = {
            'costume': 'costume dynamics',
            'cape': 'cape flow',
            'dress': 'fabric movement',
        }
        extra = clothing_motion.get(scene_elements['clothing'])
        if extra is not None:
            motions.append(extra)

    setting = scene_elements['setting']
    if setting == 'outdoor':
        motions += ['environmental effects', 'natural lighting changes']
    elif setting == 'indoor':
        motions += ['controlled lighting', 'subtle environment shifts']
    return motions
def analyze_cinematic_potential(text, composition, aspect_ratio):
    """Describe the cinematic strengths suggested by framing and caption.

    Three framing qualities are chosen from ``composition``; caption keywords
    may add lighting and colour notes. ``aspect_ratio`` is accepted but not
    consulted here.
    """
    caption = text.lower() if isinstance(text, str) else ""

    framing = {
        'Wide landscape shot': ['horizontal camera movements', 'panoramic reveals', 'environmental context'],
        'Vertical portrait shot': ['character focus', 'intimate framing', 'vertical movement'],
    }
    balanced = ['balanced framing', 'versatile movement', 'centered composition']
    # Copy so the lookup tables are never mutated by the appends below.
    qualities = list(framing[composition]) if composition in framing else list(balanced)

    # Content analysis
    if 'costume' in caption or 'dramatic' in caption:
        qualities.append('dramatic lighting potential')
    rich_color_hit = False
    for cue in ('red', 'gold', 'rich'):
        if cue in caption:
            rich_color_hit = True
            break
    if rich_color_hit:
        qualities.append('color enhancement opportunities')
    return qualities
def create_enhanced_description(basic_caption, scene_elements, emotional_tone):
    """Compose a one-sentence scene description from the analysed elements.

    Fixes two wording defects of the earlier version: the indefinite article
    now agrees with the following word ("An elegant scene", "an indoor
    setting"), and the clothing is not repeated when the subject label already
    names it ("man in suit" no longer becomes "man in suit wearing suit").

    Args:
        basic_caption: Raw caption; accepted for interface compatibility and
            not used in the sentence.
        scene_elements: dict providing ``subject``, ``setting``, ``clothing``
            and ``colors``.
        emotional_tone: Tone label placed at the start of the sentence.

    Returns:
        The description string.
    """
    def _article(word, capitalize=False):
        # "an" before a vowel letter, "a" otherwise (empty input -> "a").
        article = 'an' if str(word)[:1].lower() in set('aeiou') else 'a'
        return article.capitalize() if capitalize else article

    subject = scene_elements['subject']
    setting = scene_elements['setting']
    clothing = scene_elements['clothing']

    enhanced = f"{_article(emotional_tone, capitalize=True)} {emotional_tone} scene featuring {subject}"
    # Only mention the clothing when the subject label does not already do so.
    if clothing and str(clothing) not in str(subject):
        enhanced += f" wearing {clothing}"
    enhanced += f" in {_article(setting)} {setting} setting"
    if scene_elements['colors']:
        enhanced += f" with prominent {', '.join(scene_elements['colors'])} elements"
    return enhanced
def generate_motion_insights(scene_elements, emotional_tone, visual_style, composition):
    """Collect motion and cinematography hints for the analysed scene.

    Hints come from four sources, in this order: the person (tone and
    clothing), the composition, the visual style and the detected colours.
    At most six hints are returned.
    """
    tone_hints = {
        'dramatic': 'Emphasize powerful gestures and dynamic poses',
        'elegant': 'Focus on graceful, refined movements',
        'theatrical': 'Capture performance-style expressions and gestures',
    }
    clothing_hints = {
        'costume': 'Highlight costume details with movement',
        'cape': 'Showcase cape flow and dramatic movement',
        'dress': 'Capture fabric dynamics and elegant motion',
    }
    composition_hints = {
        'Wide landscape shot': 'Utilize horizontal camera movements and wide reveals',
        'Vertical portrait shot': 'Focus on vertical movement and character detail',
    }
    style_hints = {
        'cinematic': 'Use cinematic camera techniques and dramatic lighting',
        'dramatic': 'Emphasize bold movements and high contrast lighting',
        'professional': 'Maintain clean, controlled camera work',
    }

    candidates = []
    # Subject-based hints only apply when a person was detected.
    if scene_elements['has_person']:
        candidates.append(tone_hints.get(emotional_tone))
        if scene_elements['clothing']:
            candidates.append(clothing_hints.get(scene_elements['clothing']))
    candidates.append(composition_hints.get(composition))
    candidates.append(style_hints.get(visual_style))
    if scene_elements['colors']:
        candidates.append(f"Enhance {', '.join(scene_elements['colors'])} tones through lighting")

    # Drop the lookups that found nothing, then cap the list length.
    insights = [hint for hint in candidates if hint is not None]
    return insights[:6]
def recommend_approach(scene_elements, emotional_tone, visual_style):
    """Return a sentence recommending SARA, Gen-4 or a mix for this scene.

    Rules are evaluated top to bottom and the first match wins.
    """
    # People in costume or formal wear need precise character control.
    dressed_person = (
        scene_elements['has_person']
        and scene_elements['clothing'] in ('costume', 'suit', 'dress')
    )
    if dressed_person:
        return "SARA Framework recommended for precise character and costume control"

    # Dramatic or theatrical scenes
    if emotional_tone in ('dramatic', 'theatrical'):
        return "SARA Framework ideal for complex dramatic scenes with multiple elements"

    # Simple, natural scenes
    calm_tone = emotional_tone in ('neutral', 'peaceful')
    if calm_tone and visual_style != 'dramatic':
        return "Gen-4 method perfect for natural, iterative scene building"

    # Professional or formal contexts
    if 'professional' in (emotional_tone, visual_style):
        return "Either approach works - SARA for precision, Gen-4 for simplicity"

    return "Start with Gen-4 for base prompt, then refine with SARA for complexity"
def generate_motion_suggestions(description, aspect_ratio):
    """Suggest up to six motion ideas from a description and aspect ratio.

    Each rule pairs trigger substrings with the tips they unlock; rules are
    applied in order. A format tip is added for very wide or tall images.
    Three generic starter tips are returned when no rule fires.
    """
    caption = description.lower()

    # (trigger substrings, tips added when any trigger occurs in the caption)
    rules = (
        (('person', 'woman', 'man', 'people'), (
            'Focus on character expressions and gestures',
            'Use "the subject" or "the woman/man" for clarity',
            'Consider handheld camera for natural movement',
        )),
        (('sitting', 'standing'), (
            'Start with simple movements: speaking, gesturing',
            'Locked or steady camera works well for portraits',
        )),
        (('outdoor', 'landscape', 'nature'), (
            'Camera movement can explore the environment',
            'Consider environmental motion: wind, clouds',
            'Cinematic style complements outdoor scenes',
        )),
        (('indoor', 'room'), (
            'Controlled movements work best indoors',
            'Focus on subject motion within the space',
        )),
    )

    tips = []
    for triggers, unlocked in rules:
        if any(trigger in caption for trigger in triggers):
            tips.extend(unlocked)

    # Composition-based suggestions
    if aspect_ratio > 1.5:
        tips.append('Wide format perfect for horizontal camera movements')
    elif aspect_ratio < 0.8:
        tips.append('Portrait format ideal for character-focused content')

    if not tips:
        return [
            'Start with simple motion: "The subject moves"',
            'Add camera movement: "Camera follows naturally"',
            'Include environment: "Background remains steady"',
        ]
    return tips[:6]
def get_recommended_approach(description):
    """Pick a prompting approach from keywords in the image description.

    People-related keywords take priority over scenery keywords; anything
    else gets the generic "try both" advice. Matching is a case-insensitive
    substring search.
    """
    lowered = description.lower()

    def mentions(*keywords):
        return any(keyword in lowered for keyword in keywords)

    if mentions('person', 'woman', 'man'):
        return "SARA Framework recommended for character precision"
    if mentions('landscape', 'building', 'nature'):
        return "Gen-4 method works well for environmental scenes"
    return "Try both approaches - start with Gen-4, refine with SARA"
def detect_setting(description):
    """Classify a description as 'outdoor', 'indoor' or 'neutral'.

    Outdoor keywords are checked first, so a text mentioning both kinds
    is reported as 'outdoor'. Matching is case-insensitive substring search.
    """
    lowered = description.lower()
    keyword_table = (
        ('outdoor', ('outdoor', 'outside', 'street', 'nature')),
        ('indoor', ('indoor', 'inside', 'room', 'building')),
    )
    for label, keywords in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return label
    return 'neutral'
def extract_specific_details(description):
    """Extract colour, clothing, subject and setting hints from a description.

    All matching is case-insensitive substring search on ``description``.

    Returns:
        dict with keys:
          - 'colors': comma-joined string of the known colours found, or an
            empty list when none were found (callers only test truthiness
            and substring membership).
          - 'clothing' / 'distinctive_feature': description of the first
            clothing item found (priority = order of ``clothing_items``),
            or None.
          - 'main_object': 'man', 'woman', 'person', 'people' or None.
          - 'setting_clues': list of setting keywords found.
    """
    text = description.lower()
    details = {
        'colors': [],
        'clothing': None,
        'distinctive_feature': None,
        'main_object': None,
        'setting_clues': []
    }

    # Extract colors
    known_colors = ['red', 'blue', 'green', 'yellow', 'black', 'white', 'brown', 'pink', 'purple', 'orange']
    found_colors = [color for color in known_colors if color in text]
    if found_colors:
        details['colors'] = ', '.join(found_colors)

    # Extract clothing/costume details: the first item in priority order wins.
    clothing_items = ['cape', 'hat', 'dress', 'suit', 'shirt', 'coat', 'jacket', 'uniform', 'costume', 'robe']
    matched_item = next((item for item in clothing_items if item in text), None)
    if matched_item is not None:
        # The special cases are tied to the item that actually matched, so a
        # stray 'hat' elsewhere in the text cannot override a matched cape.
        if matched_item == 'cape' and 'red cape' in text:
            clothing, feature = 'red cape', 'flowing red cape'
        elif matched_item == 'hat' and 'red hat' in text:
            clothing = feature = 'red hat'
        else:
            clothing = feature = matched_item
        details['clothing'] = clothing
        details['distinctive_feature'] = feature

    # Extract main subject. 'woman' contains 'man', so mask it out before
    # testing for 'man'; otherwise the 'woman' branch could never be reached.
    if 'man' in text.replace('woman', ''):
        details['main_object'] = 'man'
    elif 'woman' in text:
        details['main_object'] = 'woman'
    elif 'person' in text:
        details['main_object'] = 'person'
    elif 'people' in text:
        details['main_object'] = 'people'

    # Extract setting clues
    setting_indicators = ['outdoor', 'indoor', 'street', 'room', 'building', 'nature', 'park', 'office']
    details['setting_clues'] = [indicator for indicator in setting_indicators if indicator in text]
    return details
def get_contextual_subject(description, details):
    """Return a subject phrase such as "The woman in the red hat".

    Args:
        description: Image description, matched case-insensitively.
        details: Mapping that may carry a 'clothing' entry; when truthy it
            is appended to the man/woman phrasing.

    Returns:
        "The man"/"The woman" (optionally with clothing), "The person",
        or "The subject" when no person keyword is present.
    """
    text = description.lower()
    clothing = details.get('clothing')

    # 'woman' contains 'man': mask it out first so that a description of a
    # woman is not reported as "The man".
    if 'man' in text.replace('woman', ''):
        noun = 'man'
    elif 'woman' in text:
        noun = 'woman'
    elif 'person' in text:
        return "The person"
    else:
        return "The subject"

    if clothing:
        return f"The {noun} in the {clothing}"
    return f"The {noun}"
def get_contextual_actions(description, details):
    """List candidate actions, extended by clothing and subject hints.

    The generic actions always come first; cape- and hat-specific actions
    follow when details['clothing'] mentions them, and two extra actions
    are appended when the description contains the substring 'man'.
    """
    actions = ['speaks', 'gestures', 'moves', 'looks', 'turns']

    clothing_extras = (
        ('cape', ['adjusts cape', 'moves dramatically', 'gestures with cape flowing']),
        ('hat', ['tips hat', 'adjusts hat', 'nods with hat']),
    )
    outfit = details.get('clothing')
    if outfit:
        for marker, extras in clothing_extras:
            if marker in outfit:
                actions += extras

    if 'man' in description.lower():
        actions += ['speaks confidently', 'gestures authoritatively']
    return actions
def get_contextual_adverbs(details):
    """List adverbs for the scene, extended for capes and hats.

    Reads details['clothing']; the generic adverbs always come first.
    """
    adverbs = ['naturally', 'smoothly', 'slowly', 'gracefully']
    outfit = details.get('clothing')
    if not outfit:
        return adverbs

    clothing_extras = (
        ('cape', ['dramatically', 'majestically', 'with flair']),
        ('hat', ['elegantly', 'with style', 'confidently']),
    )
    for marker, extras in clothing_extras:
        if marker in outfit:
            adverbs += extras
    return adverbs
def get_contextual_camera_movement(description, details):
    """List camera movements, extended for a cape or hat feature.

    Only details['distinctive_feature'] is consulted; ``description`` is
    accepted for signature parity but not read.
    """
    movements = ['Camera follows steadily', 'Locked camera captures', 'Handheld camera tracks']
    feature = details.get('distinctive_feature')
    if not feature:
        return movements

    feature_extras = (
        ('cape', ['Camera captures cape movement', 'Tracking shot follows cape flow']),
        ('hat', ['Camera frames from chest up', 'Close tracking of upper body']),
    )
    for marker, extras in feature_extras:
        if marker in feature:
            movements += extras
    return movements
def get_contextual_environment(description, details):
    """Return an environmental effect phrase, or None when nothing applies.

    Red colouring takes priority over a cape; ``description`` is not read.
    """
    palette = details.get('colors')
    if palette and 'red' in palette:
        return "lighting enhances red tones"

    outfit = details.get('clothing')
    if outfit and 'cape' in outfit:
        return "cape fabric reacts to air movement"

    return None
def get_contextual_style(details):
    """Choose a visual style phrase from details['clothing'].

    A cape wins over a hat; without either the documentary default is used.
    """
    outfit = details.get('clothing') or ''
    for marker, style in (
        ('cape', "dramatic cinematic style"),
        ('hat', "classic portrait style"),
    ):
        if marker in outfit:
            return style
    return "professional documentary style"
def get_contextual_atmosphere(details):
    """Choose an atmosphere phrase from colour and clothing details.

    Priority: red colouring, then cape, then hat, then a generic default.
    """
    palette = details.get('colors')
    if palette and 'red' in palette:
        return "dramatic atmosphere with rich red tones"

    outfit = details.get('clothing')
    if outfit:
        if 'cape' in outfit:
            return "heroic cinematic atmosphere"
        if 'hat' in outfit:
            return "elegant portrait atmosphere"

    return "professional cinematic atmosphere"
def optimize_user_prompt(user_idea, scene_info=None):
    """Turn a free-form user idea into a structured video prompt.

    Args:
        user_idea: Raw text from the user. None, empty and whitespace-only
            values are all treated as "no idea entered".
        scene_info: Optional scene analysis forwarded to
            create_optimized_prompt.

    Returns:
        The optimized prompt, a request to enter an idea when the input is
        empty, or an "Error optimizing prompt: ..." message if analysis or
        prompt creation raises.
    """
    # Guard against None as well as blank text; the original called
    # .strip() before the try block and crashed on None.
    idea = (user_idea or "").strip()
    if not idea:
        return "Please enter your idea first."

    try:
        # Detect language and content, then build the optimized prompt.
        analysis = analyze_user_idea(idea)
        return create_optimized_prompt(idea, analysis, scene_info)
    except Exception as e:
        # Top-level boundary: report the failure as text instead of raising.
        return f"Error optimizing prompt: {str(e)}"
def analyze_user_idea(idea):
    """Analyze a user's idea for language, content flags and complexity.

    Keyword detection is a case-insensitive substring search over small
    multilingual word lists.

    Returns:
        dict with 'language' (from detect_language), the boolean flags
        'has_action', 'has_object', 'has_emotion', 'has_camera',
        'complexity' ('simple', 'medium' for two flags, 'complex' for
        three or more) and 'main_elements' (always an empty list here).
    """
    idea_lower = idea.lower()
    analysis = {
        'language': detect_language(idea),
        'has_action': False,
        'has_object': False,
        'has_emotion': False,
        'has_camera': False,
        'complexity': 'simple',
        'main_elements': []
    }

    # Accented words below are written as real Unicode text; the previous
    # mis-encoded forms could never match what a user types.

    # Detect actions (multilingual)
    action_words = {
        'en': ['removes', 'takes off', 'puts on', 'walks', 'runs', 'speaks', 'gestures', 'moves', 'turns', 'looks'],
        'es': ['quita', 'se quita', 'pone', 'camina', 'corre', 'habla', 'gesticula', 'mueve', 'gira', 'mira'],
        'fr': ['enlève', 'met', 'marche', 'court', 'parle', 'gesticule', 'bouge'],
        'de': ['nimmt ab', 'zieht aus', 'geht', 'läuft', 'spricht', 'bewegt']
    }
    analysis['has_action'] = any(
        action in idea_lower
        for actions in action_words.values()
        for action in actions
    )

    # Detect objects/elements
    object_words = ['nose', 'nariz', 'hat', 'sombrero', 'costume', 'traje', 'cape', 'capa', 'mask', 'máscara']
    if any(obj in idea_lower for obj in object_words):
        analysis['has_object'] = True

    # Detect emotions/style
    emotion_words = ['dramatic', 'dramático', 'slow', 'lento', 'fast', 'rápido', 'gentle', 'suave', 'powerful', 'poderoso']
    if any(emotion in idea_lower for emotion in emotion_words):
        analysis['has_emotion'] = True

    # Detect camera references
    camera_words = ['camera', 'cámara', 'shot', 'toma', 'angle', 'ángulo', 'close', 'cerca', 'wide', 'amplio']
    if any(camera in idea_lower for camera in camera_words):
        analysis['has_camera'] = True

    # Determine complexity from the number of detected element kinds.
    element_count = sum([analysis['has_action'], analysis['has_object'], analysis['has_emotion'], analysis['has_camera']])
    if element_count >= 3:
        analysis['complexity'] = 'complex'
    elif element_count >= 2:
        analysis['complexity'] = 'medium'
    return analysis
def detect_language(text):
    """Guess the language of ``text`` from a few indicator words.

    Indicators are matched as whole words (or whole phrases for multi-word
    entries). Plain substring matching misfires on ordinary English:
    'se' is inside "nose" and 'el' inside "elegant", which made English
    input come back as Spanish.

    Returns:
        'spanish', 'french' or 'german' when an indicator of that language
        is present (checked in that order), otherwise 'english'.
    """
    spanish_indicators = ['el', 'la', 'se', 'que', 'con', 'por', 'para', 'del', 'de la', 'nariz', 'payaso']
    # 'clown' is deliberately not an indicator: it is spelled identically in
    # English, so it would classify English prompts as French/German.
    french_indicators = ['le', 'la', 'se', 'que', 'avec', 'pour', 'du', 'de la', 'nez']
    german_indicators = ['der', 'die', 'das', 'sich', 'mit', 'für', 'vom', 'nase']

    # Normalise to space-separated lowercase words so that punctuation does
    # not glue onto a word; the padding lets ' word ' match at both ends.
    words = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower()).split()
    padded = f" {' '.join(words)} "

    def has_any(indicators):
        return any(f' {indicator} ' in padded for indicator in indicators)

    if has_any(spanish_indicators):
        return 'spanish'
    elif has_any(french_indicators):
        return 'french'
    elif has_any(german_indicators):
        return 'german'
    else:
        return 'english'
def create_optimized_prompt(idea, analysis, scene_info=None):
    """Create an optimized English video prompt from the user's idea.

    Known Spanish/French words are translated to English first (when
    analysis['language'] has a translation table), then the result is
    handed to structure_video_prompt.

    Args:
        idea: The user's idea text.
        analysis: Result of analyze_user_idea; 'language' is read here.
        scene_info: Optional scene analysis passed through unchanged.

    Returns:
        Whatever structure_video_prompt returns for the translated idea.
    """
    import re

    # Translation dictionary for common elements (real Unicode text; the
    # previous mis-encoded keys could never match user input).
    translations = {
        'spanish': {
            'se quita': 'removes',
            'quita': 'removes',
            'pone': 'puts on',
            'camina': 'walks',
            'habla': 'speaks',
            'mueve': 'moves',
            'nariz': 'nose',
            'payaso': 'clown',
            'personaje': 'character',
            'sombrero': 'hat',
            'capa': 'cape',
            'lentamente': 'slowly',
            'rápidamente': 'quickly',
            'dramáticamente': 'dramatically'
        },
        'french': {
            'enlève': 'removes',
            'met': 'puts on',
            'marche': 'walks',
            'parle': 'speaks',
            'bouge': 'moves',
            'nez': 'nose',
            'clown': 'clown',
            'personnage': 'character',
            'chapeau': 'hat',
            'cape': 'cape'
        }
    }

    optimized_idea = idea
    mapping = translations.get(analysis['language'])
    if mapping:
        # One whole-word, case-insensitive pass. Longest keys go first so
        # 'se quita' wins over 'quita', and word boundaries stop short keys
        # such as 'met' or 'capa' from rewriting parts of longer words.
        alternatives = '|'.join(
            re.escape(source) for source in sorted(mapping, key=len, reverse=True)
        )
        pattern = re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)
        optimized_idea = pattern.sub(
            lambda match: mapping[match.group(0).lower()], idea
        )

    # Structure the prompt professionally
    return structure_video_prompt(optimized_idea, analysis, scene_info)
def structure_video_prompt(idea, analysis, scene_info=None):
    """Assemble subject, action and modifiers into a finished video prompt.

    The subject comes from keywords in ``idea`` or, failing that, from
    ``scene_info`` when it reports a person. The template depends on
    analysis['complexity'] ('simple', 'medium', anything else = complex),
    and the draft is finally passed through improve_technical_language.
    """
    lowered = idea.lower()

    # Identify subject
    if 'character' in lowered or 'personaje' in lowered:
        subject = "The character"
    elif 'person' in lowered or 'persona' in lowered:
        subject = "The person"
    elif scene_info and scene_info.get('has_person'):
        # Fall back to context from the scene analysis.
        subject = extract_intelligent_subject_reference(scene_info)
    else:
        subject = "The subject"

    action = extract_action_from_idea(idea)
    level = analysis['complexity']

    if level == 'simple':
        # Subject + action + "naturally", plus a camera suggestion.
        draft = f"{subject} {action} naturally" + ". Camera captures the motion smoothly"
    elif level == 'medium':
        # Add camera tracking and a lighting note.
        if analysis['has_emotion']:
            lighting = ", dramatic lighting enhances the mood"
        else:
            lighting = ", professional lighting"
        draft = f"{subject} {action} while camera follows steadily" + lighting
    else:
        # Full structure for complex ideas.
        draft = (
            f"{subject} {action} expressively while camera tracks the motion"
            ", lighting and environment support the action, cinematic atmosphere"
        )

    return improve_technical_language(draft)
def extract_action_from_idea(idea):
    """Extract the main action (and, for removals, its object) from an idea.

    The first key of ``action_mappings`` (in definition order) found as a
    substring of the lowercased idea decides the action; "moves" is the
    default. For removal actions a known object (nose, hat, mask) is
    appended when mentioned.

    Returns:
        e.g. "removes the clown nose", "walks", or "moves".
    """
    idea_lower = idea.lower()

    # Map common actions to video-optimized versions
    action_mappings = {
        'removes': 'removes',
        'quita': 'removes',
        'se quita': 'removes',
        'takes off': 'removes',
        'puts on': 'puts on',
        'pone': 'puts on',
        'walks': 'walks',
        'camina': 'walks',
        'speaks': 'speaks',
        'habla': 'speaks',
        'moves': 'moves',
        'mueve': 'moves',
        'turns': 'turns',
        'gira': 'turns',
        'looks': 'looks',
        'mira': 'looks'
    }
    removal_triggers = ('removes', 'quita', 'se quita', 'takes off')

    action = "moves"  # default
    object_part = ""
    for original, mapped in action_mappings.items():
        if original not in idea_lower:
            continue
        action = mapped
        if original in removal_triggers:
            # Look for what's being removed
            if 'nose' in idea_lower or 'nariz' in idea_lower:
                if 'clown' in idea_lower or 'payaso' in idea_lower:
                    object_part = "the clown nose"
                else:
                    object_part = "the nose piece"
            elif 'hat' in idea_lower or 'sombrero' in idea_lower:
                object_part = "the hat"
            # 'máscara' as real Unicode; the mis-encoded literal never matched.
            elif 'mask' in idea_lower or 'máscara' in idea_lower:
                object_part = "the mask"
        break  # the first matching action wins

    # Combine action with object
    if object_part:
        return f"{action} {object_part}"
    return action
def improve_technical_language(prompt):
    """Swap plain phrases in ``prompt`` for more polished video wording.

    Replacements are case-sensitive substring substitutions applied one
    after another in the order listed.
    """
    upgrades = (
        ('moves naturally', 'moves with natural grace'),
        ('Camera captures', 'Camera captures'),
        ('smoothly', 'with smooth motion'),
        ('follows steadily', 'follows with steady tracking'),
        ('dramatic lighting', 'dramatic lighting transitions'),
        ('professional lighting', 'professional lighting setup'),
        ('cinematic atmosphere', 'rich cinematic atmosphere'),
    )
    polished = prompt
    for plain, refined in upgrades:
        polished = polished.replace(plain, refined)
    return polished
def refine_prompt_with_feedback(current_prompt, feedback, chat_history, scene_info=None):
    """Refine ``current_prompt`` according to the user's feedback text.

    Args:
        current_prompt: Prompt to refine.
        feedback: User feedback; None, empty or whitespace-only means
            "nothing to do".
        chat_history: Sequence of [user_message, reply] pairs, or None.
        scene_info: Optional scene analysis forwarded to the helpers.

    Returns:
        (refined_prompt, new_chat_history). With blank feedback the inputs
        are returned unchanged. Otherwise the history is a new list with one
        [feedback, reply] pair appended; the passed-in history is not mutated.
    """
    # Blank (or missing) feedback: nothing to refine.
    if not feedback or not feedback.strip():
        return current_prompt, chat_history

    # Work out what the user is asking for, then apply it.
    refinement_analysis = analyze_refinement_request(feedback, current_prompt, scene_info)
    refined_prompt = apply_intelligent_refinement(current_prompt, refinement_analysis, scene_info)
    explanation = create_refinement_explanation(refinement_analysis, current_prompt, refined_prompt)

    # list(... or []) tolerates a None or tuple history, which would make
    # plain `chat_history + [...]` raise TypeError.
    new_chat_history = list(chat_history or []) + [
        [feedback, f"π€ {explanation}\n\nβ¨ **Refined Prompt**: {refined_prompt}"]
    ]
    return refined_prompt, new_chat_history
def analyze_refinement_request(feedback, current_prompt, scene_info):
    """Classify what kind of change the user's feedback asks for.

    The first keyword group (in the order below) with a case-insensitive
    substring hit decides 'request_type'. ``current_prompt`` and
    ``scene_info`` are accepted but not read here.

    Returns:
        dict with 'request_type', 'intensity', 'focus_area',
        'style_preference', 'specific_elements' (element words mentioned in
        the feedback) and 'language' (from detect_language).
    """
    feedback_lower = feedback.lower()
    analysis = {
        'request_type': 'general',
        'intensity': 'moderate',
        'focus_area': 'action',
        'style_preference': None,
        'specific_elements': [],
        'language': detect_language(feedback)
    }

    def mentions(words):
        return any(word in feedback_lower for word in words)

    # Accented keywords are written as real Unicode text; the previous
    # mis-encoded forms could never match what a user types.
    if mentions(['dramatic', 'dramático', 'dramatique', 'dramatisch']):
        analysis['request_type'] = 'dramatic'
        analysis['intensity'] = 'high'
    elif mentions(['slow', 'slower', 'lento', 'más lento', 'lentement']):
        analysis['request_type'] = 'pace'
        analysis['intensity'] = 'slow'
    elif mentions(['fast', 'faster', 'rápido', 'más rápido', 'rapide']):
        analysis['request_type'] = 'pace'
        analysis['intensity'] = 'fast'
    elif mentions(['camera', 'cámara', 'caméra', 'kamera']):
        analysis['request_type'] = 'camera'
        analysis['focus_area'] = 'cinematography'
    elif mentions(['lighting', 'light', 'luz', 'lumière', 'licht']):
        analysis['request_type'] = 'lighting'
        analysis['focus_area'] = 'atmosphere'
    elif mentions(['simple', 'simpler', 'más simple', 'plus simple']):
        analysis['request_type'] = 'simplify'
        analysis['intensity'] = 'low'
    elif mentions(['complex', 'complicated', 'detalle', 'detail', 'détail']):
        analysis['request_type'] = 'elaborate'
        analysis['intensity'] = 'high'
    elif mentions(['elegant', 'elegante', 'élégant']):
        analysis['request_type'] = 'style'
        analysis['style_preference'] = 'elegant'
    elif mentions(['powerful', 'poderoso', 'puissant']):
        analysis['request_type'] = 'style'
        analysis['style_preference'] = 'powerful'
    elif mentions(['natural', 'naturel']):
        analysis['request_type'] = 'style'
        analysis['style_preference'] = 'natural'

    # Detect specific elements mentioned
    elements = ['costume', 'dress', 'cape', 'hat', 'background', 'face', 'hands', 'movement']
    analysis['specific_elements'] = [
        element for element in elements if element in feedback_lower
    ]
    return analysis
def apply_intelligent_refinement(current_prompt, analysis, scene_info):
    """Route the prompt to the refinement matching analysis['request_type'].

    Unknown request types fall back to apply_general_enhancement.
    """
    # Lazy handlers: only the selected refinement is actually invoked.
    handlers = {
        'dramatic': lambda: enhance_dramatic_elements(current_prompt, analysis, scene_info),
        'pace': lambda: adjust_pace(current_prompt, analysis),
        'camera': lambda: enhance_camera_work(current_prompt, analysis, scene_info),
        'lighting': lambda: enhance_lighting(current_prompt, analysis, scene_info),
        'simplify': lambda: simplify_prompt(current_prompt),
        'elaborate': lambda: elaborate_prompt(current_prompt, scene_info),
        'style': lambda: apply_style_preference(current_prompt, analysis, scene_info),
    }

    def general():
        return apply_general_enhancement(current_prompt, analysis, scene_info)

    handler = handlers.get(analysis['request_type'], general)
    return handler()
def enhance_dramatic_elements(prompt, analysis, scene_info):
    """Rewrite ``prompt`` with more dramatic wording.

    Args:
        prompt: Prompt text to intensify.
        analysis: Refinement analysis (not read here).
        scene_info: Optional mapping; 'distinctive_elements' mentioning a
            costume or cape adds a matching sentence.

    Returns:
        The prompt with gentle terms replaced by dramatic ones, optional
        costume/cape sentences appended, and "Camera captures" upgraded to
        "Dynamic camera captures".
    """
    import re

    # Replace gentle actions with dramatic ones
    dramatic_replacements = {
        'naturally': 'dramatically with intensity',
        'smoothly': 'with powerful emphasis',
        'gently': 'boldly',
        'moves': 'commands attention',
        'speaks': 'declares passionately',
        'gestures': 'gestures with commanding presence',
        'professional lighting': 'dramatic lighting with stark contrasts',
        'cinematic lighting': 'theatrical lighting with deep shadows'
    }
    # Whole-word matching in a single pass: a plain substring replace of
    # 'moves' turned "removes" into "recommands attention".
    alternatives = '|'.join(
        re.escape(term) for term in sorted(dramatic_replacements, key=len, reverse=True)
    )
    pattern = re.compile(r'\b(?:' + alternatives + r')\b')
    enhanced = pattern.sub(lambda match: dramatic_replacements[match.group(0)], prompt)

    # Add dramatic elements based on scene context
    if scene_info and scene_info.get('distinctive_elements'):
        # Stringified so the check works whether the elements are a list
        # or a single string.
        elements_text = str(scene_info['distinctive_elements'])
        if 'costume' in elements_text:
            enhanced += ". Costume elements amplify the dramatic presence"
        if 'cape' in elements_text:
            enhanced += ". Cape billows dramatically with movement"

    # Enhance camera work for drama (no-op when the phrase is absent).
    return enhanced.replace('Camera captures', 'Dynamic camera captures')
def adjust_pace(prompt, analysis):
    """Slow down or speed up the wording of ``prompt``.

    analysis['intensity'] == 'slow' selects the slow vocabulary; any other
    value is treated as a request for a faster pace. Substitutions are
    case-sensitive substring replacements applied in listed order.
    """
    slow_terms = (
        ('naturally', 'slowly and deliberately'),
        ('smoothly', 'in measured slow motion'),
        ('moves', 'moves with deliberate slowness'),
        ('speaks', 'speaks thoughtfully'),
        ('gestures', 'gestures with careful precision'),
    )
    fast_terms = (
        ('naturally', 'with energetic quickness'),
        ('slowly', 'rapidly'),
        ('smoothly', 'with swift fluidity'),
        ('deliberate', 'rapid'),
        ('measured', 'quick'),
    )
    chosen = slow_terms if analysis['intensity'] == 'slow' else fast_terms

    result = prompt
    for before, after in chosen:
        result = result.replace(before, after)
    return result
def enhance_camera_work(prompt, analysis, scene_info):
    """Upgrade camera phrasing and add scene-appropriate camera notes.

    Known camera phrases are replaced (case-sensitive, in listed order),
    a framing sentence is added when scene_info['composition'] mentions
    'Wide' or 'Portrait', and a generic camera sentence is appended if the
    text still does not mention a camera. ``analysis`` is not read.
    """
    upgrades = (
        ('Camera captures', 'Dynamic camera work captures'),
        ('camera follows', 'cinematic camera tracks'),
        ('handheld camera', 'fluid handheld camera movement'),
        ('steady camera', 'precision camera operation'),
        ('locked camera', 'artistically locked camera'),
    )
    result = prompt
    for plain, upgraded in upgrades:
        result = result.replace(plain, upgraded)

    # Scene-specific technique; 'Wide' takes priority over 'Portrait'.
    if scene_info:
        composition = scene_info.get('composition', '')
        if 'Wide' in composition:
            result += ". Wide tracking shots reveal environmental context"
        elif 'Portrait' in composition:
            result += ". Intimate camera framing emphasizes character details"

    # Guarantee that some camera direction is present.
    if 'camera' not in result.lower():
        result += ". Sophisticated camera movement enhances the narrative"
    return result
def enhance_lighting(prompt, analysis, scene_info):
    """Upgrade lighting phrasing and add tone-appropriate lighting notes.

    Known lighting phrases are replaced (case-sensitive, in listed order),
    a sentence matching scene_info['emotional_tone'] is added for the
    dramatic/elegant/theatrical tones, and a generic lighting sentence is
    appended when the text mentions no light at all. ``analysis`` is not read.
    """
    upgrades = (
        ('professional lighting', 'artistic lighting design'),
        ('cinematic lighting', 'masterful cinematic lighting'),
        ('dramatic lighting', 'sculptural dramatic lighting'),
        ('natural lighting', 'beautiful natural light'),
    )
    result = prompt
    for plain, upgraded in upgrades:
        result = result.replace(plain, upgraded)

    if scene_info:
        tone = scene_info.get('emotional_tone', 'neutral')
        tone_notes = (
            ('dramatic', ". High-contrast lighting creates powerful shadows"),
            ('elegant', ". Soft, sophisticated lighting enhances refinement"),
            ('theatrical', ". Stage-quality lighting emphasizes performance"),
        )
        for name, note in tone_notes:
            if tone == name:
                result += note
                break

    # 'lighting' contains 'light', so one check covers both words.
    if 'light' not in result.lower():
        result += ". Expressive lighting design supports the mood"
    return result
def simplify_prompt(prompt):
    """Reduce a prompt to its first sentence plus one simple enhancement.

    A prompt without any '.' is returned unchanged. Otherwise the text up
    to the first '.' is kept, followed by a camera note if the prompt
    mentions a camera, else a lighting note if it mentions lighting.
    """
    first_sentence, dot, _rest = prompt.partition('.')
    if not dot:
        # Single-sentence prompt: nothing to trim.
        return prompt

    lowered = prompt.lower()
    if 'camera' in lowered:
        extra = " Camera follows naturally."
    elif 'lighting' in lowered:
        extra = " Natural lighting."
    else:
        extra = ""
    return first_sentence + '.' + extra
def elaborate_prompt(prompt, scene_info):
    """Append richer environmental and technical detail to ``prompt``.

    With a truthy ``scene_info`` a sentence for its 'setting' (outdoor,
    indoor, performance) and one for its first distinctive element
    (costume or colour) may be added. A closing composition sentence is
    always appended.
    """
    additions = []

    if scene_info:
        setting = scene_info.get('setting', 'neutral')
        distinctive_elements = scene_info.get('distinctive_elements', [])

        # Environmental detail keyed on the setting.
        if setting == 'outdoor':
            additions.append("Environmental elements respond subtly to the action")
        elif setting == 'indoor':
            additions.append("Interior atmosphere enhances intimate connection")
        elif setting == 'performance':
            additions.append("Stage environment supports theatrical presence")

        # Only the first distinctive element is considered.
        if distinctive_elements:
            lead_element = distinctive_elements[0]
            if 'costume' in lead_element:
                additions.append("Costume textures and details visible in motion")
            elif 'color' in lead_element:
                additions.append("Color palette enhanced through dynamic lighting")

    # Technical sophistication is added regardless of scene info.
    additions.append("Multi-layered composition with depth and visual interest")
    return prompt + "".join(f". {sentence}" for sentence in additions)
def apply_style_preference(prompt, analysis, scene_info):
    """Re-word ``prompt`` to match analysis['style_preference'].

    Supported preferences are 'elegant', 'powerful' and 'natural'; any
    other value leaves the prompt untouched. Substitutions are
    case-sensitive substring replacements applied in listed order.
    ``scene_info`` is not read.
    """
    vocabularies = {
        'elegant': (
            ('dramatically', 'with refined elegance'),
            ('boldly', 'gracefully'),
            ('powerfully', 'with sophisticated poise'),
            ('dramatic lighting', 'elegant lighting transitions'),
            ('intensive', 'refined'),
        ),
        'powerful': (
            ('gently', 'with commanding force'),
            ('naturally', 'with authoritative presence'),
            ('smoothly', 'with decisive power'),
            ('professional lighting', 'bold, impactful lighting'),
        ),
        'natural': (
            ('dramatically', 'naturally'),
            ('theatrical', 'authentic'),
            ('commanding', 'genuine'),
            ('dramatic lighting', 'natural lighting'),
        ),
    }
    preference = analysis['style_preference']

    result = prompt
    if preference in ('elegant', 'powerful', 'natural'):
        for before, after in vocabularies[preference]:
            result = result.replace(before, after)
    return result
def apply_general_enhancement(prompt, analysis, scene_info):
    """Apply generic polish to ``prompt`` without repeating itself.

    Args:
        prompt: Prompt text to enhance.
        analysis: Refinement analysis (not read here).
        scene_info: Optional mapping; a non-neutral 'emotional_tone' adds a
            closing "<Tone> energy throughout" sentence.

    Returns:
        The enhanced prompt. Calling the function again on its own output
        returns it unchanged.
    """
    import re

    enhanced = prompt

    # Whole-word matches only (so "removes" is left alone). The look-ahead
    # skips occurrences that already carry the inserted phrase; without it,
    # every general refinement stacked another copy.
    if 'gracefully' not in enhanced:
        enhanced = re.sub(
            r'\bmoves\b(?! with purposeful grace)',
            'moves with purposeful grace',
            enhanced,
        )
    if 'expressively' not in enhanced:
        enhanced = re.sub(
            r'\bspeaks\b(?! with genuine expression)',
            'speaks with genuine expression',
            enhanced,
        )

    # Enhance based on scene context
    if scene_info:
        emotional_tone = scene_info.get('emotional_tone', 'neutral')
        # Compare case-insensitively: the appended sentence is capitalised,
        # so a case-sensitive check never saw it and appended it again.
        if (
            emotional_tone
            and emotional_tone != 'neutral'
            and emotional_tone.lower() not in enhanced.lower()
        ):
            enhanced += f". {emotional_tone.capitalize()} energy throughout"
    return enhanced
def create_refinement_explanation(analysis, original, refined):
    """Describe, in one sentence, what kind of refinement was applied.

    The sentence is chosen by analysis['request_type']. For non-English
    feedback a short acknowledgement in that language (Spanish, French or
    German) is prepended and the English sentence is lower-cased.
    ``original`` and ``refined`` are not read.
    """
    pace_wording = 'slow and deliberate' if analysis['intensity'] == 'slow' else 'energetic and quick'
    style_wording = analysis['style_preference']

    messages = {
        'dramatic': "I've enhanced the dramatic intensity by upgrading the actions and adding powerful lighting elements.",
        'pace': f"I've adjusted the pacing to be more {pace_wording}.",
        'camera': "I've enhanced the camera work with more sophisticated cinematography techniques.",
        'lighting': "I've upgraded the lighting description to create more visual impact.",
        'simplify': "I've simplified the prompt to focus on the essential action.",
        'elaborate': "I've added more sophisticated details and environmental context.",
        'style': f"I've adjusted the style to be more {style_wording}.",
    }
    message = messages.get(analysis['request_type'], "I've enhanced the prompt based on your feedback.")

    language = analysis['language']
    if language == 'english':
        return message

    # Acknowledge the user in their own language; unknown languages get no prefix.
    acknowledgements = {
        'spanish': "Entiendo tu sugerencia y ",
        'french': "Je comprends votre suggestion et ",
        'german': "Ich verstehe Ihren Vorschlag und ",
    }
    return acknowledgements.get(language, "") + message.lower()
def generate_gen4_prompts_local(scene_info, user_input=""):
    """Generate Gen-4 prompts by building up one layer at a time.

    Four prompts are produced, each extending the previous one: basic,
    + subject motion, + camera motion, + scene/style. Wording is picked
    with random.choice from context-specific candidates.

    Args:
        scene_info: Mapping read for 'description', 'has_person' and 'setting'.
        user_input: Accepted but not used.

    Returns:
        The four labelled prompts joined by blank lines, or an
        "Error generating Gen-4 prompts: ..." message if anything raises.
    """
    try:
        description = scene_info.get('description', '')
        has_person = scene_info.get('has_person', False)
        setting = scene_info.get('setting', 'neutral')

        # Extract specific details for contextual prompts
        specific_details = extract_specific_details(description)
        subject_ref = get_contextual_subject(description, specific_details)
        prompts = []

        # Basic - specific to what's in the image
        if has_person:
            actions = get_contextual_actions(description, specific_details)
            basic = f"{subject_ref} {random.choice(actions)} to camera"
        else:
            # The key exists with value None when nothing was detected, so a
            # .get() default never applies; `or` avoids "The None moves".
            main_object = specific_details.get('main_object') or 'main element'
            basic = f"The {main_object} {random.choice(['moves', 'shifts', 'transforms'])}"
        prompts.append(f"**Basic**: {basic}")

        # + Subject Motion - add natural movement based on what's visible
        motion_adverbs = get_contextual_adverbs(specific_details)
        motion_addition = random.choice(motion_adverbs)
        with_subject = f"{basic} {motion_addition}"
        prompts.append(f"**+ Subject Motion**: {with_subject}")

        # + Camera Motion - appropriate for the scene
        camera_movements = get_contextual_camera_movement(description, specific_details)
        camera_addition = random.choice(camera_movements)
        with_camera = f"{with_subject}. {camera_addition}"
        prompts.append(f"**+ Camera Motion**: {with_camera}")

        # + Scene/Style - enhance the specific elements
        if specific_details.get('colors'):
            style_addition = f"{specific_details['colors']} tones enhanced by lighting. {get_contextual_atmosphere(specific_details)}"
        elif setting == 'outdoor':
            style_addition = "Natural lighting enhances the scene. Cinematic"
        else:
            # Same None-valued-key issue as above ("highlights None").
            feature = specific_details.get('distinctive_feature') or 'the subject'
            style_addition = f"Professional lighting highlights {feature}. Documentary style"
        complete = f"{with_camera}. {style_addition}"
        prompts.append(f"**+ Scene/Style**: {complete}")

        return "\n\n".join(prompts)
    except Exception as e:
        # Top-level boundary: report the failure as text instead of raising.
        return f"Error generating Gen-4 prompts: {str(e)}"
def build_custom_prompt_local(foundation, subject_motion, scene_motion, camera_motion, style, approach="SARA"):
    """Assemble a prompt from user-selected pieces.

    Args:
        foundation: Opening text (subject/action); skipped when empty.
        subject_motion, scene_motion: Iterables of motion phrases (or
            empty/None to skip).
        camera_motion: Camera phrase; in SARA mode a steady-background
            clause is used when it is empty.
        style: Closing style/atmosphere text; skipped when empty.
        approach: "SARA" for the space-joined SARA layout; any other value
            selects the Gen-4 layout with ". " between pieces.

    Returns:
        The assembled prompt string.
    """
    if approach != "SARA":
        # Gen-4: foundation, subject motion, camera, scene motion, style.
        pieces = [foundation] if foundation else []
        pieces += subject_motion or []
        if camera_motion:
            pieces.append(camera_motion)
        pieces += scene_motion or []
        if style:
            pieces.append(style)
        if not pieces:
            return "The subject moves naturally"
        return ". ".join(pieces)

    # SARA: [Subject] [Action] while [Reference], [Atmosphere]
    motions = [*(subject_motion or ()), *(scene_motion or ())]
    reference = f"while {camera_motion}" if camera_motion else "while background remains steady"

    pieces = []
    if foundation:
        pieces.append(foundation)
    if motions:
        pieces.append(", ".join(motions))
    pieces.append(reference)
    if style:
        pieces.append(style)
    return " ".join(pieces)
def get_smart_suggestions_local(scene_info):
    """Build a newline-separated list of suggestions from a scene analysis.

    Reads 'enhanced_description', 'emotional_tone', 'visual_style',
    'distinctive_elements', 'motion_potential', 'setting' and
    'cinematic_qualities' from ``scene_info``. Without an enhanced
    description a request to analyze an image first is returned instead.
    At most ten lines are emitted.
    """
    summary = scene_info.get('enhanced_description', '')
    tone = scene_info.get('emotional_tone', 'neutral')
    look = scene_info.get('visual_style', 'cinematic')
    highlights = scene_info.get('distinctive_elements', [])
    motion_ideas = scene_info.get('motion_potential', [])
    setting = scene_info.get('setting', 'neutral')

    if not summary:
        return "Please analyze an image first to generate smart suggestions."

    lines = []

    # Scene understanding and the recommended subject reference.
    subject_ref = extract_intelligent_subject_reference(scene_info)
    lines.append(f'π€ **AI Analysis**: {summary}')
    lines.append(f'π― **Smart Reference**: Use "{subject_ref}" for optimal clarity')

    # Up to three actions suited to the emotional tone.
    tone_actions = generate_tone_appropriate_actions(tone, scene_info)[:3]
    lines.append(f'π **Tone-Matched Actions**: {", ".join(tone_actions)}')

    # Optional sections: only shown when the analysis supplied data.
    if motion_ideas:
        lines.append(f'π¬ **Motion Opportunities**: {", ".join(motion_ideas[:3])}')
    if highlights:
        lines.append(f'β¨ **Key Elements to Highlight**: {", ".join(highlights[:2])}')

    # Up to two camera ideas for the visual style.
    camera_ideas = generate_style_appropriate_cameras(look, scene_info.get('cinematic_qualities', []))[:2]
    lines.append(f'π₯ **Style-Appropriate Cameras**: {", ".join(camera_ideas)}')

    # Three adverbs, each requested separately from the tone helper.
    tone_adverbs = [get_tone_appropriate_adverb(tone) for _ in range(3)]
    lines.append(f'π« **Emotional Adverbs**: {", ".join(tone_adverbs)}')

    # One setting-specific note, when the setting is recognised.
    setting_notes = (
        ('performance', 'πͺ **Performance Context**: Focus on stage presence and audience engagement'),
        ('outdoor', 'πΏ **Outdoor Setting**: Leverage natural lighting and environmental elements'),
        ('indoor', 'π **Indoor Context**: Utilize controlled lighting and intimate framing'),
    )
    for name, note in setting_notes:
        if setting == name:
            lines.append(note)
            break

    qualities = scene_info.get('cinematic_qualities', [])
    if qualities:
        lines.append(f'π¬ **Cinematic Opportunities**: {", ".join(qualities[:2])}')

    atmosphere = get_style_appropriate_atmosphere(look, tone)
    lines.append(f'π **Recommended Atmosphere**: {atmosphere}')

    return "\n".join(lines[:10])
def generate_instant_prompts(scene_info):
    """Assemble a multi-section block of ready-to-paste video prompts.

    Reads the analysed-scene mapping (description, tone, style, distinctive
    elements, cinematic qualities, motion potential, setting) and emits five
    sections: simple prompts, SARA prompts, Gen-4 prompts, specialised prompts
    and one composite prompt. Wording is chosen with ``random``, so repeated
    calls give different text.

    Args:
        scene_info: dict-like analysis result; missing keys fall back to
            defaults ('neutral' tone, 'cinematic' style, empty lists).

    Returns:
        A newline-joined string, or a short notice asking for an image
        analysis when no enhanced description is available.
    """
    description = scene_info.get('enhanced_description', '')
    tone = scene_info.get('emotional_tone', 'neutral')
    style = scene_info.get('visual_style', 'cinematic')
    highlights = scene_info.get('distinctive_elements', [])
    qualities = scene_info.get('cinematic_qualities', [])
    motions = scene_info.get('motion_potential', [])

    # Nothing to work with until an image has been analysed
    if not description:
        return "Please analyze an image first to generate instant prompts."

    subject = extract_intelligent_subject_reference(scene_info)
    action_pool = generate_tone_appropriate_actions(tone, scene_info)
    camera_pool = generate_style_appropriate_cameras(style, qualities)

    def any_action():
        # One random draw from the tone-matched action pool
        return random.choice(action_pool)

    # --- Section 1: subject + action + adverb ---
    lines = ["π€ **AI-Powered Simple Prompts:**"]
    for _ in range(3):
        verb = any_action()
        manner = get_tone_appropriate_adverb(tone)
        lines.append(f" β’ {subject} {verb} {manner}")

    # --- Section 2: SARA-style prompts with camera and atmosphere ---
    lines.append("\nπ§ **Context-Aware SARA Prompts:**")
    for _ in range(3):
        verb = any_action()
        manner = get_tone_appropriate_adverb(tone)
        shot = random.choice(camera_pool)
        mood = get_style_appropriate_atmosphere(style, tone)
        lead = f" β’ {subject} {verb} {manner} while {shot}"
        # Coin flip decides whether a distinctive element is woven in
        if highlights and random.choice([True, False]):
            accent = random.choice(highlights)
            lines.append(f"{lead}, {accent} enhanced, {mood}")
        else:
            lines.append(f"{lead}, {mood}")

    # --- Section 3: Gen-4 prompts (action. camera. style.) ---
    lines.append("\n㪠**Intelligence-Enhanced Gen-4:**")
    for _ in range(3):
        verb = any_action()
        manner = get_tone_appropriate_adverb(tone)
        shot = random.choice(camera_pool)
        if highlights:
            accent = random.choice(highlights)
            finish = f"{accent} highlighted by {get_lighting_for_style(style)}"
        else:
            finish = f"{get_lighting_for_style(style)} enhances {tone} mood"
        lines.append(f" β’ {subject} {verb} {manner}. {shot}. {finish}")

    # --- Section 4: prompts triggered by specific analysis flags ---
    lines.append("\n⨠**Specialized AI Prompts:**")
    if 'costume dynamics' in motions:
        verb = any_action()
        lines.append(f" π **Costume Dynamics**: {subject} {verb} while camera captures fabric textures, costume elements react to movement, theatrical lighting")
    if 'facial expressions' in motions:
        reaction = random.choice(['expresses emotion', 'speaks meaningfully', 'reacts naturally'])
        lines.append(f" π **Expression Focus**: {subject} {reaction} while camera maintains intimate framing, {tone} energy emphasized")
    if 'dramatic lighting potential' in qualities:
        verb = any_action()
        lines.append(f" π‘ **Dramatic Lighting**: {subject} {verb} as lighting creates dramatic shadows, visual contrast enhances {tone} mood, cinematic depth")
    if 'color enhancement opportunities' in qualities:
        colour_notes = [item for item in highlights if 'coloring' in item]
        if colour_notes:
            verb = any_action()
            lines.append(f" π¨ **Color Enhanced**: {subject} {verb} while lighting dramatically enhances {colour_notes[0]}, color grading emphasizes mood, {style} aesthetic")

    # Setting-driven extra prompt
    location = scene_info.get('setting', 'neutral')
    if location == 'performance':
        stage_verb = random.choice(['performs', 'presents', 'commands attention'])
        lines.append(f" πͺ **Performance Mode**: {subject} {stage_verb} while audience perspective maintained, {tone} stage presence, professional capture")
    elif location == 'outdoor':
        verb = any_action()
        lines.append(f" πΏ **Environmental Harmony**: {subject} {verb} as natural elements complement motion, environmental lighting, organic {style} feel")

    # --- Section 5: single long composite prompt ---
    lines.append("\nπ **Advanced AI Composite:**")
    verb = any_action()
    manner = get_tone_appropriate_adverb(tone)
    shot = random.choice(camera_pool)
    mood = get_style_appropriate_atmosphere(style, tone)
    if highlights:
        accent = random.choice(highlights)
        composite = f"{subject} {verb} {manner} as {shot} captures nuanced details, {accent} dynamically enhanced, lighting and color grading amplify {tone} undertones, {mood} with {style} cinematography"
    else:
        composite = f"{subject} {verb} {manner} while {shot} follows natural rhythm, environmental elements support the motion, {mood} with intelligent {style} direction"
    lines.append(f" β’ {composite}")

    return "\n".join(lines)
def extract_intelligent_subject_reference(scene_info):
    """Choose a short noun phrase that refers to the scene's subject.

    Args:
        scene_info: dict-like analysis result. Reads ``has_person``,
            ``enhanced_description`` and ``basic_description``.

    Returns:
        One of "The man in costume", "The woman in dress", "The man in suit",
        "The man", "The woman", "The person", or "The subject" when no person
        is flagged or no keyword is found.
    """
    import re  # function-scope import keeps this block self-contained

    def mentions(phrase, text):
        # Whole-word match: a plain substring test would let "man" fire on
        # "woman", "human" or "performance" and mislabel the subject.
        return re.search(rf"\b{re.escape(phrase)}\b", text) is not None

    # Without a detected person there is nothing more specific to say
    if not scene_info.get('has_person', False):
        return "The subject"

    enhanced_desc = scene_info.get('enhanced_description', '')
    basic_desc = scene_info.get('basic_description', '')

    # Prefer the richer phrases found in the enhanced description
    if isinstance(enhanced_desc, str):
        enhanced_lower = enhanced_desc.lower()
        for phrase in ('man in costume', 'woman in dress', 'man in suit'):
            if mentions(phrase, enhanced_lower):
                return f"The {phrase}"

    # Fall back to a bare noun from the basic description
    if isinstance(basic_desc, str):
        basic_lower = basic_desc.lower()
        for noun in ('man', 'woman', 'person'):
            if mentions(noun, basic_lower):
                return f"The {noun}"

    return "The subject"
def generate_tone_appropriate_actions(emotional_tone, scene_info):
    """Return action phrases suited to the emotional tone and scene props.

    Args:
        emotional_tone: tone key such as 'dramatic' or 'elegant'; unknown
            values use the 'neutral' list.
        scene_info: dict-like analysis result; ``distinctive_elements`` is
            scanned for the keywords 'costume', 'cape' and 'flag'.

    Returns:
        A new list: the five base actions for the tone, followed by any
        prop-specific actions, each appearing at most once.
    """
    base_actions = {
        'dramatic': ['moves powerfully', 'gestures boldly', 'commands attention', 'strikes a pose', 'displays intensity'],
        'elegant': ['moves gracefully', 'gestures refined', 'poses elegantly', 'demonstrates poise', 'flows naturally'],
        'theatrical': ['performs dramatically', 'presents theatrically', 'expresses character', 'embodies role', 'captivates audience'],
        'serious': ['maintains composure', 'speaks authoritatively', 'gestures formally', 'projects confidence', 'demonstrates focus'],
        'cheerful': ['expresses joy', 'gestures enthusiastically', 'radiates energy', 'shows warmth', 'displays positivity'],
        'professional': ['presents professionally', 'maintains bearing', 'demonstrates expertise', 'projects authority', 'engages formally'],
        'neutral': ['moves naturally', 'gestures appropriately', 'maintains presence', 'expresses subtly', 'demonstrates character'],
    }
    # Keyword -> extra actions, checked in this order for each element
    prop_actions = (
        ('costume', ['adjusts costume', 'displays costume details']),
        ('cape', ['gestures with cape', 'moves dramatically with cape']),
        ('flag', ['acknowledges flag', 'presents with flag']),
    )

    # Copy so the lookup table is never mutated
    actions = list(base_actions.get(emotional_tone, base_actions['neutral']))

    for element in scene_info.get('distinctive_elements') or []:
        for keyword, extras in prop_actions:
            if keyword in element:
                # Skip actions already present: several elements naming the
                # same prop must not weight later random picks towards it.
                actions.extend(extra for extra in extras if extra not in actions)
                break  # only the first matching keyword applies per element
    return actions
def generate_style_appropriate_cameras(visual_style, cinematic_qualities):
    """Return camera-movement phrases for a visual style.

    Args:
        visual_style: style key; unknown values use the 'cinematic' presets.
        cinematic_qualities: container of quality labels; three specific
            labels each contribute two extra camera phrases.

    Returns:
        A new list of camera phrases (style presets first, extras after).
    """
    style_presets = {
        'cinematic': ['camera glides smoothly', 'tracking shot follows', 'camera orbits elegantly', 'dolly movement captures', 'crane shot reveals'],
        'dramatic': ['camera emphasizes motion', 'dynamic camera movement', 'camera captures intensity', 'bold camera work follows', 'dramatic camera angles'],
        'theatrical': ['camera frames performance', 'audience perspective maintained', 'camera captures stage presence', 'performance-focused framing', 'theatrical camera work'],
        'professional': ['steady camera captures', 'professional camera movement', 'controlled camera work', 'camera maintains stability', 'precise camera tracking'],
        'documentary': ['handheld camera follows', 'natural camera movement', 'camera observes genuinely', 'documentary-style capture', 'authentic camera work'],
    }
    # Quality label -> phrases appended when that label is present
    quality_extras = (
        ('horizontal camera movements', ['camera pans horizontally', 'lateral camera movement']),
        ('vertical movement', ['camera tilts vertically', 'vertical camera motion']),
        ('environmental context', ['camera reveals environment', 'wide establishing shots']),
    )

    # Fresh list so the preset table stays untouched
    selected = list(style_presets.get(visual_style, style_presets['cinematic']))
    for label, extra_moves in quality_extras:
        if label in cinematic_qualities:
            selected += extra_moves
    return selected
def get_tone_appropriate_adverb(emotional_tone):
    """Pick one random adverb matching the emotional tone.

    Args:
        emotional_tone: tone key such as 'dramatic' or 'cheerful'; unknown
            values use the 'neutral' list.

    Returns:
        A single adverb string, chosen with ``random.choice``.
    """
    adverbs = {
        'dramatic': ['powerfully', 'intensely', 'dramatically', 'boldly', 'majestically'],
        'elegant': ['gracefully', 'refinedly', 'elegantly', 'smoothly', 'sophisticatedly'],
        # 'captivatingly', not the adjective 'captivating': the result is
        # placed directly after a verb phrase in the generated prompts.
        'theatrical': ['dramatically', 'expressively', 'theatrically', 'charismatically', 'captivatingly'],
        'serious': ['authoritatively', 'professionally', 'formally', 'confidently', 'purposefully'],
        'cheerful': ['enthusiastically', 'energetically', 'warmly', 'positively', 'vibrantly'],
        'professional': ['professionally', 'precisely', 'competently', 'expertly', 'authoritatively'],
        'neutral': ['naturally', 'smoothly', 'appropriately', 'genuinely', 'authentically'],
    }
    return random.choice(adverbs.get(emotional_tone, adverbs['neutral']))
def get_style_appropriate_atmosphere(visual_style, emotional_tone):
    """Compose an atmosphere phrase from a visual style and an emotional tone.

    Args:
        visual_style: style key; unknown values are used verbatim as the
            leading word with the trailing word 'atmosphere'.
        emotional_tone: tone word placed in the middle of the phrase.

    Returns:
        A three-part phrase, e.g. 'authentic cheerful feeling'.
    """
    # style -> (leading word, trailing word)
    phrasing = {
        'cinematic': ('cinematic', 'atmosphere'),
        'dramatic': ('dramatic', 'mood'),
        'theatrical': ('theatrical', 'presence'),
        'professional': ('professional', 'environment'),
        'documentary': ('authentic', 'feeling'),
    }
    lead, tail = phrasing.get(visual_style, (visual_style, 'atmosphere'))
    return f'{lead} {emotional_tone} {tail}'
def get_lighting_for_style(visual_style):
    """Return the lighting phrase that goes with a visual style.

    Args:
        visual_style: style key such as 'theatrical' or 'documentary'.

    Returns:
        The matching lighting phrase, or 'cinematic lighting' for any
        unrecognised style.
    """
    # NOTE: the statements that used to follow the return (an older Gen-4 /
    # cape / dress / hat / colour prompt builder) could never execute and
    # referenced names that are not defined here, so they were removed.
    lighting = {
        'cinematic': 'cinematic lighting',
        'dramatic': 'dramatic lighting',
        'theatrical': 'stage lighting',
        'professional': 'professional lighting',
        'documentary': 'natural lighting',
    }
    return lighting.get(visual_style, 'cinematic lighting')
def copy_to_foundation(prompt_text, approach):
    """Strip the leading "<emoji> **Label**: " prefix from a formatted prompt.

    Args:
        prompt_text: formatted prompt line, possibly carrying a bold label.
        approach: accepted for interface compatibility; not used.

    Returns:
        Everything after the first "**: " marker, or the text unchanged when
        no such marker is present.
    """
    # No bold markup at all means there is no label to remove
    if "**" not in prompt_text:
        return prompt_text

    _, marker, remainder = prompt_text.partition("**: ")
    # An empty marker means the label terminator was not found
    return remainder if marker else prompt_text
# Create optimized Gradio interface
# Builds the whole UI inside one gr.Blocks context bound to `demo`:
# a guide tab, a generator tab (image analysis, suggestions, instant prompts,
# AI assistant, Gen-4 generator, custom builder), then the button wiring.
# NOTE(review): leading indentation is missing from this listing, so the exact
# nesting of the containers below cannot be confirmed from here.
with gr.Blocks(theme=gr.themes.Soft(), title="Universal Video Prompting Tool") as demo:
gr.Markdown("# π¬ Universal Video Prompting Tool")
gr.Markdown("*Compatible with Gen-4, Sora, Pika, Luma, Runway & all AI video models*")
gr.Markdown("**Combines official Gen-4 guidelines with advanced SARA Framework**")
with gr.Tabs():
# Static guide text; `unified_instructions` is defined outside this section
with gr.TabItem("π Prompting Guide"):
gr.Markdown(unified_instructions)
with gr.TabItem("π¬ Quick Video Prompt Generator"):
with gr.Row():
with gr.Column(scale=1):
# Image upload and analysis
gr.Markdown("## π· Upload Your Frame 0")
image_input = gr.Image(type="pil", label="Upload your initial frame")
analyze_btn = gr.Button("π Analyze Image (Fast)", variant="primary")
# Read-only box that receives the analysis text
image_analysis = gr.Textbox(
label="Image Analysis Results",
placeholder="Upload an image and click 'Analyze Image' for instant analysis...",
lines=10,
interactive=False
)
# Hidden state for scene info
# Starts as an empty dict; passed as input to most handlers below
scene_info_state = gr.State({})
# Quick suggestions
with gr.Group():
gr.Markdown("### π‘ Smart Suggestions")
get_suggestions_btn = gr.Button("Get Smart Tips", variant="secondary")
smart_suggestions = gr.Textbox(
label="Context-Aware Suggestions",
placeholder="Click 'Get Smart Tips' after image analysis...",
lines=5,
interactive=False
)
# Instant prompts - NEW SECTION
# Editable output with a copy button, filled by generate_instant_prompts
with gr.Group():
gr.Markdown("### π Ready-to-Use Prompts")
generate_instant_btn = gr.Button("Generate Instant Prompts", variant="primary")
instant_prompts = gr.Textbox(
label="Copy & Paste Ready Prompts",
placeholder="Click 'Generate Instant Prompts' to get ready-to-use prompts based on your image...",
lines=12,
interactive=True,
show_copy_button=True
)
with gr.Column(scale=1):
# Prompt generation methods
gr.Markdown("## π Choose Your Method")
with gr.Tabs():
# Tab 1: free-text idea -> optimized prompt, plus a refinement chat
with gr.TabItem("π€ AI Prompt Assistant"):
gr.Markdown("*Describe your idea in any language - AI will create optimized English video prompts*")
with gr.Row():
with gr.Column(scale=2):
user_idea = gr.Textbox(
label="Your Idea (any language)",
placeholder="e.g., 'el personaje se quita la nariz de payaso' or 'character walks slowly towards camera'",
lines=3
)
with gr.Column(scale=1):
optimize_btn = gr.Button("π Optimize & Structure", variant="primary")
# Serves as both the optimize output and the refine input/output
ai_optimized = gr.Textbox(
label="AI-Optimized Video Prompt",
placeholder="Your optimized prompt will appear here...",
lines=4,
interactive=True,
show_copy_button=True
)
# Chat interface for refinement
gr.Markdown("### π¬ Refine Your Prompt")
chat_history = gr.Chatbot(
label="Prompt Refinement Chat",
height=250,
placeholder="Chat history will appear here as you refine your prompt..."
)
with gr.Row():
refine_input = gr.Textbox(
label="Refine further",
placeholder="e.g., 'make it more dramatic' or 'add camera movement' or 'mΓ‘s lento'",
scale=3
)
refine_btn = gr.Button("π¬ Refine", scale=1)
# Tab 2: Gen-4 prompt generation from an optional foundation sentence
with gr.TabItem("π Gen-4 Official"):
gr.Markdown("*Official method: Simple β Complex building*")
foundation_gen4 = gr.Textbox(
label="Foundation (Optional)",
placeholder="e.g., 'The subject walks forward'",
lines=1
)
generate_gen4_btn = gr.Button("Generate Gen-4 Prompts", variant="primary")
gen4_output = gr.Textbox(
label="Gen-4 Style Prompts",
lines=8,
interactive=False
)
# Custom prompt builder
# Manual assembly: approach + foundation + motion/camera/style selections
with gr.Group():
gr.Markdown("## π οΈ Custom Prompt Builder")
with gr.Row():
approach_selector = gr.Radio(
choices=["SARA", "Gen-4"],
value="SARA",
label="Approach",
interactive=True
)
custom_foundation = gr.Textbox(
label="Foundation",
placeholder="The subject...",
lines=1
)
with gr.Row():
subject_motion = gr.CheckboxGroup(
choices=["walks smoothly", "speaks clearly", "gestures naturally", "moves gracefully", "turns slowly"],
label="Subject Motion"
)
scene_motion = gr.CheckboxGroup(
choices=["dust swirls", "lighting changes", "wind effects", "water movement", "atmosphere shifts"],
label="Scene Motion"
)
with gr.Row():
camera_motion = gr.Dropdown(
choices=["camera remains steady", "handheld camera", "camera pans left", "camera pans right", "camera tracks forward", "camera zooms in"],
label="Camera Motion",
value="camera remains steady"
)
style_motion = gr.Dropdown(
choices=["cinematic", "documentary style", "live-action", "dramatic", "peaceful", "energetic", "professional"],
label="Style/Atmosphere",
value="cinematic"
)
build_custom_btn = gr.Button("π¨ Build Custom Prompt", variant="secondary")
custom_output = gr.Textbox(
label="Your Custom Prompt",
lines=3,
interactive=True
)
# Event handlers
# Each .click() binds a button to a handler; the handler functions other than
# generate_instant_prompts are defined outside this section.
# Three output slots: the analysis text, an unnamed gr.State() that nothing
# else in this section reads, and the shared scene_info_state.
# NOTE(review): presumably analyze_image_simple returns three values - confirm.
analyze_btn.click(
fn=analyze_image_simple,
inputs=[image_input],
outputs=[image_analysis, gr.State(), scene_info_state]
)
# Suggestions are derived from the stored scene info only
get_suggestions_btn.click(
fn=get_smart_suggestions_local,
inputs=[scene_info_state],
outputs=[smart_suggestions]
)
# NEW: Generate instant prompts
generate_instant_btn.click(
fn=generate_instant_prompts,
inputs=[scene_info_state],
outputs=[instant_prompts]
)
# NEW: AI Prompt Assistant
optimize_btn.click(
fn=optimize_user_prompt,
inputs=[user_idea, scene_info_state],
outputs=[ai_optimized]
)
# Refinement reads the current prompt and chat, and writes both back
refine_btn.click(
fn=refine_prompt_with_feedback,
inputs=[ai_optimized, refine_input, chat_history, scene_info_state],
outputs=[ai_optimized, chat_history]
)
generate_gen4_btn.click(
fn=generate_gen4_prompts_local,
inputs=[scene_info_state, foundation_gen4],
outputs=[gen4_output]
)
# Custom builder takes its six inputs in this fixed positional order
build_custom_btn.click(
fn=build_custom_prompt_local,
inputs=[custom_foundation, subject_motion, scene_motion, camera_motion, style_motion, approach_selector],
outputs=[custom_output]
)
# Launch the app
# Start the Gradio server only when this file is run as a script,
# so importing the module does not open the UI.
if __name__ == "__main__":
    demo.launch()