[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"tag-vllm":3},{"tag":4,"articles":11},{"id":5,"name":6,"slug":7,"article_count":8,"description_zh":9,"description_en":10},"6acb2d1f-934e-4e31-a9d1-8e4392fb099a","vLLM","vllm",6,"vLLM 是面向大型語言模型的高吞吐推理引擎，重點在 PagedAttention、KV cache 管理與連續批次處理，讓 GPU 更有效率地服務聊天、RAG、批次生成與多模型部署。","vLLM is a high-throughput inference engine for large language models, built around PagedAttention, KV cache management, and continuous batching. It matters for chat services, RAG pipelines, batch generation, and multi-model GPU deployment.",[12,21,29,36,43,50,57],{"id":13,"slug":14,"title":15,"summary":16,"category":17,"image_url":18,"cover_image":18,"language":19,"created_at":20},"381fb6c6-6da7-4444-831f-8c5eed8d685c","turboquant-vllm-comparison-fp8-kv-cache-zh","TurboQuant 與 FP8 實測結果","vLLM 首次大規模比較 TurboQuant 與 FP8 KV-cache。結果很直白：FP8 在速度上更穩，TurboQuant 的高壓縮版本則常掉準確率。","research","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778839867551-4v9g.png","zh","2026-05-15T10:10:36.034569+00:00",{"id":22,"slug":23,"title":24,"summary":25,"category":26,"image_url":27,"cover_image":27,"language":19,"created_at":28},"fe630502-5455-4001-a6bf-0643f9eb469d","gemma-4-assistant-models-faster-draft-tokens-zh","Gemma 4 助手模型加速草稿 Token","Gemma 4 的 E2B 與 E4B 助手模型用 centroid masking，把草稿 token 的 lm_head 計算量砍到約 45 倍，且品質損失很小。","tools","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1778278246167-hskc.png","2026-05-08T22:10:33.309766+00:00",{"id":30,"slug":31,"title":32,"summary":33,"category":26,"image_url":34,"cover_image":34,"language":19,"created_at":35},"feb9176d-89c6-4bd0-a82a-8440625d8c94","awesome-open-source-ai-projects-list-zh","開源 AI 專案清單怎麼挑","這份 GitHub 清單收錄可直接上線的開源 AI 專案，從 PyTorch 到 vLLM 都有，2,486 顆星，適合想找模型、推理、RAG 和代理工具的工程師。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775999036470-b4zr.png","2026-04-12T13:03:35.795784+00:00",{"id":37,"slug":38,"title":39,"summary":40,"category":26,"image_url":41,"cover_image":41,"language":19,"created_at":42},"868034d7-415b-49bd-8f25-4dbd602e7094","unsloth-qwen35-partial-fine-tuning-zh","Unsloth 讓 Qwen3.5 可分層微調","Unsloth 新增 Qwen3.5 視覺模型分層微調，能只訓練 vision、language、attention 或 MLP。VRAM 更省，訓練也更快，對多模態團隊很實用。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775218014686-wj6q.png","2026-04-03T12:06:38.523525+00:00",{"id":44,"slug":45,"title":46,"summary":47,"category":17,"image_url":48,"cover_image":48,"language":19,"created_at":49},"fdb08bdf-a3bd-4c4d-acaf-ce8035f24449","turboquant-google-paper-explained-zh","TurboQuant 是什麼？Google 新論文重點","Google 的 TurboQuant 盯上 LLM 的 KV cache 瓶頸，用低位元量化降低記憶體用量與推論成本。這篇帶你看它在解什麼問題、和其他優化法差在哪。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775160957331-6iua.png","2026-04-02T20:15:40.07166+00:00",{"id":51,"slug":52,"title":53,"summary":54,"category":26,"image_url":55,"cover_image":55,"language":19,"created_at":56},"d233c90c-e7d8-418d-a8dc-f76080f1b968","turboquant-fast-cold-starts-rust-gpu-zh","TurboQuant、冷啟動與 GPU Rust","TurboQuant 把 KV cache 壓到 4.6 倍，GPU state restore 盯上 32B 模型冷啟動，Rust 也更深入 CUDA 
開發。","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775146380823-5d5u.png","2026-04-02T16:12:38.23896+00:00",{"id":58,"slug":59,"title":60,"summary":61,"category":62,"image_url":63,"cover_image":63,"language":19,"created_at":64},"d9fda242-d695-4ea4-a0e0-c6c64ad72965","nvidia-sets-new-mlperf-inference-records-zh","NVIDIA 再刷 MLPerf 推論紀錄","NVIDIA 在 MLPerf Inference v6.0 再交出新成績，GB300 NVL72 對 DeepSeek-R1 伺服器推論提升 2.7x，Llama 3.1 405B 也提升 1.5x。","industry","https:\u002F\u002Fxxdpdyhzhpamafnrdkyq.supabase.co\u002Fstorage\u002Fv1\u002Fobject\u002Fpublic\u002Fcovers\u002Finline-1775122496881-vxz0.png","2026-04-02T08:48:38.43437+00:00"]