{
  "leaderboardData": [
    {
      "info": {
        "name": "Step-Audio-2",
        "size": "-",
        "type": "closed",
        "link": "https://arxiv.org/abs/2507.16632"
      },
      "Sound": {
        "Test-mini": "84.04",
        "Test": "80.60"
      },
      "Music": {
        "Test-mini": "73.56",
        "Test": "68.23"
      },
      "Speech": {
        "Test-mini": "75.15",
        "Test": "72.75"
      },
      "Avg": {
        "Test-mini": "77.58",
        "Test": "73.86"
      }
    },
    {
      "info": {
        "name": "Audio Flamingo 3",
        "size": "8.2B",
        "type": "open_source",
        "link": "https://www.arxiv.org/abs/2507.08128"
      },
      "Sound": {
        "Test-mini": "79.58",
        "Test": "75.83"
      },
      "Music": {
        "Test-mini": "73.95",
        "Test": "74.47"
      },
      "Speech": {
        "Test-mini": "66.37",
        "Test": "66.97"
      },
      "Avg": {
        "Test-mini": "73.30",
        "Test": "72.42"
      }
    },
    {
      "info": {
        "name": "Step-Audio-2-mini",
        "size": "8.3B",
        "type": "open_access",
        "link": "https://arxiv.org/abs/2507.16632"
      },
      "Sound": {
        "Test-mini": "79.30",
        "Test": "75.57"
      },
      "Music": {
        "Test-mini": "68.44",
        "Test": "66.85"
      },
      "Speech": {
        "Test-mini": "68.16",
        "Test": "66.49"
      },
      "Avg": {
        "Test-mini": "72.73",
        "Test": "70.23"
      }
    },
    {
      "info": {
        "name": "DeSTA2.5-Audio",
        "size": "8B",
        "type": "open_source",
        "link": "https://arxiv.org/abs/2507.02768"
      },
      "Sound": {
        "Test-mini": "70.27",
        "Test": "66.83"
      },
      "Music": {
        "Test-mini": "56.29",
        "Test": "57.10"
      },
      "Speech": {
        "Test-mini": "71.47",
        "Test": "71.94"
      },
      "Avg": {
        "Test-mini": "66.00",
        "Test": "65.21"
      }
    },
    {
      "info": {
        "name": "Kimi-Audio",
        "size": "8.2B",
        "type": "open_access",
        "link": "https://arxiv.org/abs/2504.18425"
      },
      "Sound": {
        "Test-mini": "75.68",
        "Test": "70.70"
      },
      "Music": {
        "Test-mini": "66.77",
        "Test": "65.93"
      },
      "Speech": {
        "Test-mini": "62.16",
        "Test": "56.57"
      },
      "Avg": {
        "Test-mini": "68.20",
        "Test": "64.40"
      }
    },
    {
      "info": {
        "name": "Qwen2-Audio-Instruct",
        "size": "7B",
        "type": "open_access",
        "link": "https://arxiv.org/abs/2407.10759"
      },
      "Sound": {
        "Test-mini": "67.27",
        "Test": "61.17"
      },
      "Music": {
        "Test-mini": "56.29",
        "Test": "55.67"
      },
      "Speech": {
        "Test-mini": "55.26",
        "Test": "55.37"
      },
      "Avg": {
        "Test-mini": "59.60",
        "Test": "57.40"
      }
    },
    {
      "info": {
        "name": "Phi-4-multimodal",
        "size": "5.5B",
        "type": "open_access",
        "link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct"
      },
      "Sound": {
        "Test-mini": "65.47",
        "Test": "62.67"
      },
      "Music": {
        "Test-mini": "64.37",
        "Test": "61.97"
      },
      "Speech": {
        "Test-mini": "67.27",
        "Test": "63.80"
      },
      "Avg": {
        "Test-mini": "65.70",
        "Test": "62.81"
      }
    },
    {
      "info": {
        "name": "Audio Reasoner",
        "size": "8.2B",
        "type": "open_access",
        "link": "https://huggingface.co/zhifeixie/Audio-Reasoner/tree/main"
      },
      "Sound": {
        "Test-mini": "67.87",
        "Test": "67.27"
      },
      "Music": {
        "Test-mini": "69.16",
        "Test": "61.53"
      },
      "Speech": {
        "Test-mini": "66.07",
        "Test": "62.53"
      },
      "Avg": {
        "Test-mini": "67.70",
        "Test": "63.78"
      }
    },
    {
      "info": {
        "name": "Audio Flamingo 2",
        "size": "3B",
        "type": "open_source",
        "link": "https://huggingface.co/nvidia/audio-flamingo"
      },
      "Sound": {
        "Test-mini": "71.47",
        "Test": "68.13"
      },
      "Music": {
        "Test-mini": "70.96",
        "Test": "70.20"
      },
      "Speech": {
        "Test-mini": "44.74",
        "Test": "44.87"
      },
      "Avg": {
        "Test-mini": "62.40",
        "Test": "61.06"
      }
    },
    {
      "info": {
        "name": "Audio Flamingo Chat",
        "size": "1B",
        "type": "open_source",
        "link": "https://huggingface.co/nvidia/audio-flamingo"
      },
      "Sound": {
        "Test-mini": "25.23",
        "Test": "23.33"
      },
      "Music": {
        "Test-mini": "17.66",
        "Test": "15.77"
      },
      "Speech": {
        "Test-mini": "6.91",
        "Test": "7.67"
      },
      "Avg": {
        "Test-mini": "16.60",
        "Test": "15.59"
      }
    },
    {
      "info": {
        "name": "M2UGen",
        "size": "7B",
        "type": "open_source",
        "link": "https://huggingface.co/M2UGen/M2UGen-MusicGen-medium"
      },
      "Sound": {
        "Test-mini": "43.24",
        "Test": "44.97"
      },
      "Music": {
        "Test-mini": "37.13",
        "Test": "38.53"
      },
      "Speech": {
        "Test-mini": "33.33",
        "Test": "35.77"
      },
      "Avg": {
        "Test-mini": "37.90",
        "Test": "39.76"
      }
    },
    {
      "info": {
        "name": "LTU",
        "size": "7B",
        "type": "open_source",
        "link": "https://openreview.net/pdf?id=nBZBPXdJlC"
      },
      "Sound": {
        "Test-mini": "20.42",
        "Test": "20.67"
      },
      "Music": {
        "Test-mini": "15.97",
        "Test": "15.68"
      },
      "Speech": {
        "Test-mini": "15.92",
        "Test": "15.33"
      },
      "Avg": {
        "Test-mini": "17.44",
        "Test": "17.23"
      }
    },
    {
      "info": {
        "name": "SALMONN",
        "size": "13B",
        "type": "open_source",
        "link": "https://arxiv.org/pdf/2310.13289"
      },
      "Sound": {
        "Test-mini": "41.14",
        "Test": "42.10"
      },
      "Music": {
        "Test-mini": "37.13",
        "Test": "37.83"
      },
      "Speech": {
        "Test-mini": "26.43",
        "Test": "28.77"
      },
      "Avg": {
        "Test-mini": "34.90",
        "Test": "36.23"
      }
    },
    {
      "info": {
        "name": "MusiLingo",
        "size": "7B",
        "type": "open_source",
        "link": "https://huggingface.co/m-a-p/MusiLingo-long-v1"
      },
      "Sound": {
        "Test-mini": "43.24",
        "Test": "41.93"
      },
      "Music": {
        "Test-mini": "40.12",
        "Test": "41.23"
      },
      "Speech": {
        "Test-mini": "31.23",
        "Test": "31.70"
      },
      "Avg": {
        "Test-mini": "38.20",
        "Test": "38.29"
      }
    },
    {
      "info": {
        "name": "MuLLaMa",
        "size": "7B",
        "type": "open_source",
        "link": "https://arxiv.org/pdf/2308.11276"
      },
      "Sound": {
        "Test-mini": "33.03",
        "Test": "30.97"
      },
      "Music": {
        "Test-mini": "32.34",
        "Test": "29.67"
      },
      "Speech": {
        "Test-mini": "17.42",
        "Test": "17.10"
      },
      "Avg": {
        "Test-mini": "27.60",
        "Test": "25.91"
      }
    },
    {
      "info": {
        "name": "GAMA",
        "size": "7B",
        "type": "open_source",
        "link": "https://huggingface.co/spaces/sonalkum/GAMA"
      },
      "Sound": {
        "Test-mini": "31.83",
        "Test": "30.73"
      },
      "Music": {
        "Test-mini": "17.71",
        "Test": "17.33"
      },
      "Speech": {
        "Test-mini": "12.91",
        "Test": "16.97"
      },
      "Avg": {
        "Test-mini": "20.82",
        "Test": "21.68"
      }
    },
    {
      "info": {
        "name": "GAMA-IT",
        "size": "7B",
        "type": "open_source",
        "link": "https://huggingface.co/spaces/sonalkum/GAMA-IT"
      },
      "Sound": {
        "Test-mini": "30.93",
        "Test": "32.73"
      },
      "Music": {
        "Test-mini": "26.74",
        "Test": "22.37"
      },
      "Speech": {
        "Test-mini": "10.81",
        "Test": "11.57"
      },
      "Avg": {
        "Test-mini": "22.83",
        "Test": "22.22"
      }
    },
    {
      "info": {
        "name": "Qwen2.5-Omni",
        "size": "8.2B",
        "type": "open_access",
        "link": "https://arxiv.org/abs/2503.20215"
      },
      "Sound": {
        "Test-mini": "78.10",
        "Test": "76.77"
      },
      "Music": {
        "Test-mini": "65.90",
        "Test": "67.33"
      },
      "Speech": {
        "Test-mini": "70.60",
        "Test": "68.90"
      },
      "Avg": {
        "Test-mini": "71.50",
        "Test": "71.00"
      }
    },
    {
      "info": {
        "name": "Gemini 2.5 Flash",
        "size": "-",
        "type": "proprietary",
        "link": "https://ai.google.dev/gemini-api/docs/models"
      },
      "Sound": {
        "Test-mini": "73.27",
        "Test": "69.50"
      },
      "Music": {
        "Test-mini": "65.57",
        "Test": "69.40"
      },
      "Speech": {
        "Test-mini": "76.58",
        "Test": "68.27"
      },
      "Avg": {
        "Test-mini": "71.80",
        "Test": "67.39"
      }
    },
    {
      "info": {
        "name": "Gemini 2.0 Flash",
        "size": "-",
        "type": "proprietary",
        "link": "https://ai.google.dev/gemini-api/docs/models"
      },
      "Sound": {
        "Test-mini": "71.17",
        "Test": "68.93"
      },
      "Music": {
        "Test-mini": "65.27",
        "Test": "59.30"
      },
      "Speech": {
        "Test-mini": "75.08",
        "Test": "72.87"
      },
      "Avg": {
        "Test-mini": "70.50",
        "Test": "67.03"
      }
    },
    {
      "info": {
        "name": "Gemini 2.5 Flash Lite",
        "size": "-",
        "type": "proprietary",
        "link": "https://ai.google.dev/gemini-api/docs/models"
      },
      "Sound": {
        "Test-mini": "63.06",
        "Test": "62.50"
      },
      "Music": {
        "Test-mini": "63.47",
        "Test": "54.87"
      },
      "Speech": {
        "Test-mini": "72.07",
        "Test": "67.47"
      },
      "Avg": {
        "Test-mini": "66.20",
        "Test": "61.61"
      }
    },
    {
      "info": {
        "name": "Gemini 2.5 Pro",
        "size": "-",
        "type": "proprietary",
        "link": "https://ai.google.dev/gemini-api/docs/models"
      },
      "Sound": {
        "Test-mini": "75.08",
        "Test": "70.63"
      },
      "Music": {
        "Test-mini": "68.26",
        "Test": "64.77"
      },
      "Speech": {
        "Test-mini": "71.47",
        "Test": "72.67"
      },
      "Avg": {
        "Test-mini": "71.60",
        "Test": "69.36"
      }
    },
    {
      "info": {
        "name": "GPT-4o mini Audio",
        "size": "-",
        "type": "proprietary",
        "link": "https://arxiv.org/abs/2410.21276"
      },
      "Sound": {
        "Test-mini": "50.75",
        "Test": "49.67"
      },
      "Music": {
        "Test-mini": "39.22",
        "Test": "35.97"
      },
      "Speech": {
        "Test-mini": "69.07",
        "Test": "67.47"
      },
      "Avg": {
        "Test-mini": "53.00",
        "Test": "51.03"
      }
    },
    {
      "info": {
        "name": "GPT-4o Audio",
        "size": "-",
        "type": "proprietary",
        "link": "https://arxiv.org/abs/2410.21276"
      },
      "Sound": {
        "Test-mini": "64.56",
        "Test": "63.20"
      },
      "Music": {
        "Test-mini": "56.29",
        "Test": "49.93"
      },
      "Speech": {
        "Test-mini": "66.67",
        "Test": "69.33"
      },
      "Avg": {
        "Test-mini": "62.50",
        "Test": "60.82"
      }
    },
    {
      "info": {
        "name": "Gemma 3n",
        "size": "2B",
        "type": "open_access",
        "link": "https://deepmind.google/models/gemma/gemma-3n/"
      },
      "Sound": {
        "Test-mini": "51.35",
        "Test": "47.47"
      },
      "Music": {
        "Test-mini": "52.10",
        "Test": "51.63"
      },
      "Speech": {
        "Test-mini": "52.22",
        "Test": "57.07"
      },
      "Avg": {
        "Test-mini": "51.69",
        "Test": "52.06"
      }
    },
    {
      "info": {
        "name": "Gemma 3n",
        "size": "4B",
        "type": "open_access",
        "link": "https://deepmind.google/models/gemma/gemma-3n/"
      },
      "Sound": {
        "Test-mini": "55.86",
        "Test": "50.27"
      },
      "Music": {
        "Test-mini": "56.89",
        "Test": "53.20"
      },
      "Speech": {
        "Test-mini": "61.26",
        "Test": "62.13"
      },
      "Avg": {
        "Test-mini": "58.00",
        "Test": "55.20"
      }
    },
    {
      "info": {
        "name": "Audio-Thinker",
        "size": "8.4B",
        "type": "closed",
        "link": "https://arxiv.org/pdf/2508.08039v1"
      },
      "Sound": {
        "Test-mini": "81.98",
        "Test": "78.80"
      },
      "Music": {
        "Test-mini": "74.25",
        "Test": "73.80"
      },
      "Speech": {
        "Test-mini": "76.88",
        "Test": "75.16"
      },
      "Avg": {
        "Test-mini": "77.70",
        "Test": "75.98"
      }
    },
    {
      "info": {
        "name": "MiMo-Audio",
        "size": "7B",
        "type": "open_access",
        "link": "https://github.com/XiaomiMiMo/MiMo-Audio/"
      },
      "Sound": {
        "Test-mini": "81.68",
        "Test": "77.20"
      },
      "Music": {
        "Test-mini": "74.25",
        "Test": "69.73"
      },
      "Speech": {
        "Test-mini": "68.17",
        "Test": "70.77"
      },
      "Avg": {
        "Test-mini": "74.70",
        "Test": "72.59"
      }
    },
    {
      "info": {
        "name": "Nova 2 Omni",
        "size": "-",
        "type": "closed",
        "link": "https://www.aboutamazon.com/news/aws/aws-agentic-ai-amazon-bedrock-nova-models"
      },
      "Sound": {
        "Test-mini": "81.08",
        "Test": "77.87"
      },
      "Music": {
        "Test-mini": "70.36",
        "Test": "66.37"
      },
      "Speech": {
        "Test-mini": "81.98",
        "Test": "81.82"
      },
      "Avg": {
        "Test-mini": "77.80",
        "Test": "75.28"
      }
    }
  ]
}
