[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tag-benchmarking":3},{"tag":4,"articles":9},{"id":5,"name":6,"slug":6,"article_count":7,"description_zh":8,"description_en":8},"b3e6fc89-e26d-480d-ac7f-7a8581f60fe5","benchmarking",2,null,[10,19,26,33],{"id":11,"slug":12,"title":13,"summary":14,"category":15,"image_url":16,"cover_image":16,"language":17,"created_at":18},"13519d21-7023-407c-8974-7c633ebede9f","why-open-source-llms-should-be-judged-by-workload-not-hype-en","Why Open-Source LLMs Must Be Judged by Workload, Not Hype","Open-source LLMs in 2026 should be chosen by workload fit, not benchmark hype.","research","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778095240831-b3wm.png","en","2026-05-06T19:20:22.800006+00:00",{"id":20,"slug":21,"title":22,"summary":23,"category":15,"image_url":24,"cover_image":24,"language":17,"created_at":25},"7ac3d870-d844-4d95-a287-81b22dfa9eca","deeptest-2026-llm-car-manual-assistant-en","DeepTest 2026 benchmarks an LLM car manual assistant","DeepTest’s first LLM testing competition compared four tools on car manual retrieval, showing how to benchmark automotive assistants.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778048468789-e7sx.png","2026-05-06T06:20:33.071908+00:00",{"id":27,"slug":28,"title":29,"summary":30,"category":15,"image_url":31,"cover_image":31,"language":17,"created_at":32},"2a6b0902-8cf2-42c9-9b38-59e6ed0294c9","speechparaling-bench-paralinguistic-speech-generation-en","SpeechParaling-Bench tests speech models on nuance","A new benchmark expands paralinguistic speech evaluation past coarse labels, using 1,000+ queries and pairwise judging to expose model gaps.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1776924234257-ns8c.png","2026-04-23T06:03:39.315548+00:00",{"id":34,"slug":35,"title":36,"summary":37,"category":15,"image_url":38,"cover_image":38,"language":17,"created_at":39},"be5dca83-11ca-4d7b-b1b8-ec3eb4005a8c","hippocamp-benchmarks-contextual-agents-personal-computers-en","HippoCamp tests agents on your personal files","HippoCamp benchmarks multimodal agents on dense personal file systems, exposing weak retrieval, grounding, and cross-modal reasoning.","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775115017182-8a6y.png","2026-04-02T06:03:26.745712+00:00"]