Runnable with vLLM

import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Free any cached GPU memory before loading the full-precision model
torch.cuda.empty_cache()

# Per-device memory caps: two GPUs with 22 GiB each, plus CPU RAM for offloading
max_memory = {0: "22GiB", 1: "22GiB", "cpu": "160GiB"}
# Source model and destination directory for the quantized weights
model_path = "meta-llama/Llama-3.3-70B-Instruct"
quant_path = "ibnzterrell/Meta-Llama-3.3-70B-Instruct-AWQ-INT4"

# AWQ settings: 4-bit weights, group size 128, GEMM kernels
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}
# Load the FP16 model on the CPU; max_memory governs any later device dispatch
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    use_cache=False,
    max_memory=max_memory,
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Run AWQ calibration and quantize the weights to INT4
model.quantize(tokenizer, quant_config=quant_config)

# Persist the quantized model and tokenizer
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
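As the listing's title suggests, the saved checkpoint can be served with vLLM. Below is a minimal sketch of loading it; the prompt, sampling settings, and tensor_parallel_size=2 (to spread the 70B AWQ weights over the two 22 GiB GPUs) are assumptions, not part of the original script.

# Sketch: serve the AWQ checkpoint with vLLM (assumed settings; adjust to your hardware)
from vllm import LLM, SamplingParams

llm = LLM(
    model=quant_path,            # directory written by save_quantized above
    quantization="awq",          # weights are AWQ INT4
    tensor_parallel_size=2,      # assumption: split the model across both GPUs
)

sampling_params = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Explain AWQ quantization in one paragraph."], sampling_params)
print(outputs[0].outputs[0].text)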