# GLM-5.2 OpenAI-Compatible API: A Hands-On Guide to Reasoning Effort, Function Calling, and Long-Context Retrieval
import sys
import subprocess

# Notebook-style bootstrap: install/upgrade the OpenAI SDK *before* importing
# it below, which is why this call sits between the import groups.
# check=False means a failed install does not raise here.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-U", "openai"], check=False)

import os
import re
import json
import time
import getpass

from openai import OpenAI
# OpenAI-compatible endpoints that serve GLM-5.2. Each entry holds the base URL
# handed to the client, the provider-specific model id, and the name of the
# environment variable / secret that stores the API key.
PROVIDERS = {
    "zai": {
        "base_url": "https://api.z.ai/api/paas/v4/",
        "model": "glm-5.2",
        "env": "ZAI_API_KEY",
    },
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "model": "z-ai/glm-5.2",
        "env": "OPENROUTER_API_KEY",
    },
    "together": {
        "base_url": "https://api.together.xyz/v1",
        "model": "zai-org/GLM-5.2",
        "env": "TOGETHER_API_KEY",
    },
    "requesty": {
        "base_url": "https://router.requesty.ai/v1",
        "model": "zai/glm-5.2",
        "env": "REQUESTY_API_KEY",
    },
    "huggingface": {
        "base_url": "https://router.huggingface.co/v1",
        "model": "zai-org/GLM-5.2",
        "env": "HF_TOKEN",
    },
}

# Change PROVIDER to any key of PROVIDERS to switch endpoints.
PROVIDER = "zai"
CFG = PROVIDERS[PROVIDER]
MODEL = CFG["model"]
def load_api_key(env_name):
    """Return the API key stored under ``env_name``.

    Sources are tried in order:

    1. Colab secrets (``google.colab.userdata``), if available.
    2. The process environment variable ``env_name``.
    3. An interactive ``getpass`` prompt.

    Args:
        env_name: Name of the secret / environment variable to look up.

    Returns:
        The first non-empty value found, or whatever the user types at the
        prompt.
    """
    try:
        from google.colab import userdata

        secret = userdata.get(env_name)
        if secret:
            return secret
    except Exception:
        # Deliberate best-effort: outside Colab the import fails, and any
        # lookup error inside Colab should fall through to the next source.
        pass

    from_env = os.environ.get(env_name)
    if from_env:
        return from_env

    return getpass.getpass(f"Enter your {env_name}: ")
# Shared OpenAI-compatible client pointed at the selected provider's endpoint.
client = OpenAI(api_key=load_api_key(CFG["env"]), base_url=CFG["base_url"])
# Input / output token prices.
# NOTE(review): presumably currency units per one million tokens (the "_PER_M"
# suffix) - confirm against the provider's price list.
PRICE_IN_PER_M, PRICE_OUT_PER_M = 1.40, 4.40

# Running totals: prompt tokens ("in"), completion tokens ("out"), and number
# of tracked calls.
_USAGE = {"in": 0, "out": 0, "calls": 0}
def _track(usage):
    """Add one response's token usage to the module-level ``_USAGE`` totals.

    Args:
        usage: Object exposing ``prompt_tokens`` / ``completion_tokens``
            attributes. A falsy value (e.g. ``None``) is ignored.
    """
    if not usage:
        return
    # `or 0` covers attributes that exist but are None.
    _USAGE["in"] += getattr(usage, "prompt_tokens", 0) or 0
    _USAGE["out"] += getattr(usage, "completion_tokens", 0) or 0
    # NOTE(review): the source's indentation was lost; a call is counted only
    # when a usage object is present - confirm this matches the intent.
    _USAGE["calls"] += 1
def get_reasoning(obj):
    """Pull GLM's hidden reasoning trace from a message/delta (a provider-extra field).

    The value is looked up in three places, in order:

    1. ``obj.reasoning_content``
    2. ``obj.model_extra["reasoning_content"]``
    3. ``obj.to_dict()["reasoning_content"]``

    Args:
        obj: A message or streaming delta object.

    Returns:
        The first truthy value from steps 1-2, else whatever step 3 yields;
        ``None`` when nothing is found or ``to_dict`` is unavailable or fails.
    """
    direct = getattr(obj, "reasoning_content", None)
    if direct:
        return direct

    # model_extra may be missing or None; normalise to an empty dict.
    extras = getattr(obj, "model_extra", None) or {}
    if extras.get("reasoning_content"):
        return extras["reasoning_content"]

    try:
        return obj.to_dict().get("reasoning_content")
    except Exception:
        # Best-effort fallback: objects without a usable to_dict() simply
        # have no reasoning trace.
        return None
def chat(messages, effort=None, thinking=True, tools=None, tool_choice="auto",
         stream=False, max_tokens=2048, temperature=1.0, tool_stream=False):
    """Send one chat-completion request for ``MODEL`` through ``client``.

    GLM-specific params go through extra_body so any OpenAI client works.

    Args:
        messages: Chat messages, forwarded unchanged.
        effort: None | "high" | "max" (GLM-5.2 thinking-effort level; max is
            the model default). Sent as ``reasoning_effort`` only when
            ``thinking`` is also true.
        thinking: True -> deep thinking on; False -> off (fast, cheap,
            low-latency).
        tools: Optional tool definitions. When truthy they are sent together
            with ``tool_choice``; otherwise neither field is sent.
        tool_choice: Tool-choice value forwarded alongside ``tools``.
        stream: Forwarded as ``stream``; when true, usage reporting is also
            requested via ``stream_options``.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        tool_stream: When true, sets ``tool_stream`` in ``extra_body``.

    Returns:
        The result of ``client.chat.completions.create``.
    """
    extra = {"thinking": {"type": "enabled" if thinking else "disabled"}}
    if effort and thinking:
        extra["reasoning_effort"] = effort
    if tool_stream:
        extra["tool_stream"] = True

    kwargs = dict(
        model=MODEL,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=stream,
        extra_body=extra,
    )
    if tools:
        kwargs.update(tools=tools, tool_choice=tool_choice)
    if stream:
        # Ask the server to include token usage in the streamed response.
        kwargs["stream_options"] = {"include_usage": True}

    return client.chat.completions.create(**kwargs)
# NOTE(review): this bootstrap repeats the one at the top of the file and is
# redundant; `sys` and `subprocess` are already imported there.
# Install/upgrade the OpenAI SDK before importing it; check=False means a
# failed install does not raise here.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-U", "openai"], check=False)

import os
import re
import json
import time
import getpass

from openai import OpenAI
# OpenAI-compatible endpoints that serve GLM-5.2. Each entry holds the base URL
# handed to the client, the provider-specific model id, and the name of the
# environment variable / secret that stores the API key.
PROVIDERS = {
    "zai": {
        "base_url": "https://api.z.ai/api/paas/v4/",
        "model": "glm-5.2",
        "env": "ZAI_API_KEY",
    },
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "model": "z-ai/glm-5.2",
        "env": "OPENROUTER_API_KEY",
    },
    "together": {
        "base_url": "https://api.together.xyz/v1",
        "model": "zai-org/GLM-5.2",
        "env": "TOGETHER_API_KEY",
    },
    "requesty": {
        "base_url": "https://router.requesty.ai/v1",
        "model": "zai/glm-5.2",
        "env": "REQUESTY_API_KEY",
    },
    "huggingface": {
        "base_url": "https://router.huggingface.co/v1",
        "model": "zai-org/GLM-5.2",
        "env": "HF_TOKEN",
    },
}

# Change PROVIDER to any key of PROVIDERS to switch endpoints.
PROVIDER = "zai"
CFG = PROVIDERS[PROVIDER]
MODEL = CFG["model"]
def load_api_key(env_name):
    """Return the API key stored under ``env_name``.

    Sources are tried in order:

    1. Colab secrets (``google.colab.userdata``), if available.
    2. The process environment variable ``env_name``.
    3. An interactive ``getpass`` prompt.

    Args:
        env_name: Name of the secret / environment variable to look up.

    Returns:
        The first non-empty value found, or whatever the user types at the
        prompt.
    """
    try:
        from google.colab import userdata

        secret = userdata.get(env_name)
        if secret:
            return secret
    except Exception:
        # Deliberate best-effort: outside Colab the import fails, and any
        # lookup error inside Colab should fall through to the next source.
        pass

    from_env = os.environ.get(env_name)
    if from_env:
        return from_env

    return getpass.getpass(f"Enter your {env_name}: ")
# Shared OpenAI-compatible client pointed at the selected provider's endpoint.
client = OpenAI(api_key=load_api_key(CFG["env"]), base_url=CFG["base_url"])
# Input / output token prices.
# NOTE(review): presumably currency units per one million tokens (the "_PER_M"
# suffix) - confirm against the provider's price list.
PRICE_IN_PER_M, PRICE_OUT_PER_M = 1.40, 4.40

# Running totals: prompt tokens ("in"), completion tokens ("out"), and number
# of tracked calls.
_USAGE = {"in": 0, "out": 0, "calls": 0}
def _track(usage):
    """Add one response's token usage to the module-level ``_USAGE`` totals.

    Args:
        usage: Object exposing ``prompt_tokens`` / ``completion_tokens``
            attributes. A falsy value (e.g. ``None``) is ignored.
    """
    if not usage:
        return
    # `or 0` covers attributes that exist but are None.
    _USAGE["in"] += getattr(usage, "prompt_tokens", 0) or 0
    _USAGE["out"] += getattr(usage, "completion_tokens", 0) or 0
    # NOTE(review): the source's indentation was lost; a call is counted only
    # when a usage object is present - confirm this matches the intent.
    _USAGE["calls"] += 1
def get_reasoning(obj):
    """Pull GLM's hidden reasoning trace from a message/delta (a provider-extra field).

    The value is looked up in three places, in order:

    1. ``obj.reasoning_content``
    2. ``obj.model_extra["reasoning_content"]``
    3. ``obj.to_dict()["reasoning_content"]``

    Args:
        obj: A message or streaming delta object.

    Returns:
        The first truthy value from steps 1-2, else whatever step 3 yields;
        ``None`` when nothing is found or ``to_dict`` is unavailable or fails.
    """
    direct = getattr(obj, "reasoning_content", None)
    if direct:
        return direct

    # model_extra may be missing or None; normalise to an empty dict.
    extras = getattr(obj, "model_extra", None) or {}
    if extras.get("reasoning_content"):
        return extras["reasoning_content"]

    try:
        return obj.to_dict().get("reasoning_content")
    except Exception:
        # Best-effort fallback: objects without a usable to_dict() simply
        # have no reasoning trace.
        return None
def chat(messages, effort=None, thinking=True, tools=None, tool_choice="auto",
         stream=False, max_tokens=2048, temperature=1.0, tool_stream=False):
    """Send one chat-completion request for ``MODEL`` through ``client``.

    GLM-specific params go through extra_body so any OpenAI client works.

    Args:
        messages: Chat messages, forwarded unchanged.
        effort: None | "high" | "max" (GLM-5.2 thinking-effort level; max is
            the model default). Sent as ``reasoning_effort`` only when
            ``thinking`` is also true.
        thinking: True -> deep thinking on; False -> off (fast, cheap,
            low-latency).
        tools: Optional tool definitions. When truthy they are sent together
            with ``tool_choice``; otherwise neither field is sent.
        tool_choice: Tool-choice value forwarded alongside ``tools``.
        stream: Forwarded as ``stream``; when true, usage reporting is also
            requested via ``stream_options``.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        tool_stream: When true, sets ``tool_stream`` in ``extra_body``.

    Returns:
        The result of ``client.chat.completions.create``.
    """
    extra = {"thinking": {"type": "enabled" if thinking else "disabled"}}
    if effort and thinking:
        extra["reasoning_effort"] = effort
    if tool_stream:
        extra["tool_stream"] = True

    kwargs = dict(
        model=MODEL,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=stream,
        extra_body=extra,
    )
    if tools:
        kwargs.update(tools=tools, tool_choice=tool_choice)
    if stream:
        # Ask the server to include token usage in the streamed response.
        kwargs["stream_options"] = {"include_usage": True}

    return client.chat.completions.create(**kwargs)
