Code Implementation of Qwen3.5 Refined Reasoning Models with Claude-Style Reasoning Using GGUF and 4-Bit Quantization

In this tutorial, we work directly with Qwen3.5 models distilled with Claude-style reasoning and set up a Colab pipeline that lets us switch between the 27B GGUF variant and the lightweight 2B 4-bit version with a single flag. We start by verifying GPU availability, then conditionally install llama.cpp or transformers with bitsandbytes depending on the chosen backend. Both branches are unified behind the generate_fn and stream_fn entry points, which ensures consistent generation across backends. We also implement multi-turn conversations with a ChatSession class, so all downstream tests run unchanged regardless of which model is loaded.
# Backend selector: "27B_GGUF" (llama.cpp) or "2B_HF" (transformers, 4-bit).
MODEL_PATH = "2B_HF"

import torch

# Both backends need CUDA; fail fast with actionable guidance for Colab users.
if not torch.cuda.is_available():
    raise RuntimeError(
        "❌ No GPU! Go to Runtime → Change runtime type → T4 GPU."
    )

# Report which GPU we got and how much memory is available.
gpu_name = torch.cuda.get_device_name(0)
vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"✅ GPU: {gpu_name} — {vram_gb:.1f} GB VRAM")

import subprocess, sys, os, re, time

# Backend-agnostic entry points; bound by whichever branch executes below.
generate_fn = None
stream_fn = None
We start by setting the model-path flag and checking whether a GPU is available on the system. We retrieve and print the GPU name and the available VRAM to ensure that the environment meets the requirements. We also import the core libraries and define placeholders for the unified generation functions that will be bound later.
if MODEL_PATH == "27B_GGUF":
    # --- Backend A: 27B GGUF model served by llama.cpp with CUDA offload ---
    print("\n📦 Installing llama-cpp-python with CUDA (takes 3-5 min)...")
    env = os.environ.copy()
    env["CMAKE_ARGS"] = "-DGGML_CUDA=on"  # build llama.cpp with CUDA kernels
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", "llama-cpp-python", "huggingface_hub"],
        env=env,
    )
    print("✅ Installed.\n")

    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama

    GGUF_REPO = "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF"
    GGUF_FILE = "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_M.gguf"
    print(f"⏳ Downloading {GGUF_FILE} (~16.5 GB)... grab a coffee ☕")
    model_path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
    print(f"✅ Downloaded: {model_path}\n")

    print("⏳ Loading into llama.cpp (GPU offload)...")
    llm = Llama(
        model_path=model_path,
        n_ctx=8192,        # context window size
        n_gpu_layers=40,   # layers offloaded to the GPU; tune to fit VRAM
        n_threads=4,
        verbose=False,
    )
    print("✅ 27B GGUF model loaded!\n")

    def generate_fn(
        prompt, system_prompt="You are a helpful assistant. Think step by step.",
        max_new_tokens=2048, temperature=0.6, top_p=0.95, **kwargs
    ):
        """Run a single chat completion and return the full response text."""
        output = llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return output["choices"][0]["message"]["content"]

    def stream_fn(
        prompt, system_prompt="You are a helpful assistant. Think step by step.",
        max_new_tokens=2048, temperature=0.6, top_p=0.95,
    ):
        """Stream a chat completion token-by-token to stdout."""
        print("⏳ Streaming output:\n")
        for chunk in llm.create_chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        ):
            # Streaming chunks carry incremental text in the "delta" field.
            delta = chunk["choices"][0].get("delta", {})
            text = delta.get("content", "")
            if text:
                print(text, end="", flush=True)
        print()

    class ChatSession:
        """Multi-turn chat that accumulates the full message history."""

        def __init__(self, system_prompt="You are a helpful assistant. Think step by step."):
            self.messages = [{"role": "system", "content": system_prompt}]

        def chat(self, user_message, temperature=0.6):
            """Append the user turn, query the model, record and return its reply."""
            self.messages.append({"role": "user", "content": user_message})
            output = llm.create_chat_completion(
                messages=self.messages, max_tokens=2048,
                temperature=temperature, top_p=0.95,
            )
            resp = output["choices"][0]["message"]["content"]
            self.messages.append({"role": "assistant", "content": resp})
            return resp
We handle the 27B GGUF path by installing llama.cpp with CUDA support and downloading the distilled Qwen3.5 27B model from Hugging Face. We load the model with GPU offload and define the standard generate_fn and stream_fn for batch and streaming output. We also use the ChatSession class to store chat history across multiple interactions.
elif MODEL_PATH == "2B_HF":
print("n📦 Installing transformers + bitsandbytes...")
subprocess.check_call([
sys.executable, "-m", "pip", "install", "-q",
"transformers @ git+
"accelerate", "bitsandbytes", "sentencepiece", "protobuf",
])
print("✅ Installed.n")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
HF_MODEL_ID = "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled"
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
)
print(f"⏳ Loading {HF_MODEL_ID} in 4-bit...")
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
HF_MODEL_ID,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
)
print(f"✅ Model loaded! Memory: {model.get_memory_footprint() / 1e9:.2f} GBn")
def generate_fn(
prompt, system_prompt="You are a helpful assistant. Think step by step.",
max_new_tokens=2048, temperature=0.6, top_p=0.95,
repetition_penalty=1.05, do_sample=True, **kwargs
):
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
output_ids = model.generate(
**inputs, max_new_tokens=max_new_tokens, temperature=temperature,
top_p=top_p, repetition_penalty=repetition_penalty, do_sample=do_sample,
)
generated = output_ids[0][inputs["input_ids"].shape[1]:]
return tokenizer.decode(generated, skip_special_tokens=True)
def stream_fn(
prompt, system_prompt="You are a helpful assistant. Think step by step.",
max_new_tokens=2048, temperature=0.6, top_p=0.95,
):
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
print("⏳ Streaming output:n")
with torch.no_grad():
model.generate(
**inputs, max_new_tokens=max_new_tokens, temperature=temperature,
top_p=top_p, do_sample=True, streamer=streamer,
)
class ChatSession:
def __init__(self, system_prompt="You are a helpful assistant. Think step by step."):
self.messages = [{"role": "system", "content": system_prompt}]
def chat(self, user_message, temperature=0.6):
self.messages.append({"role": "user", "content": user_message})
text = tokenizer.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
output_ids = model.generate(
**inputs, max_new_tokens=2048, temperature=temperature, top_p=0.95, do_sample=True,
)
generated = output_ids[0][inputs["input_ids"].shape[1]:]
resp = tokenizer.decode(generated, skip_special_tokens=True)
self.messages.append({"role": "assistant", "content": resp})
return resp
else:
raise ValueError("MODEL_PATH must be '27B_GGUF' or '2B_HF'")
We implement the lightweight 2B path using transformers with 4-bit bitsandbytes quantization. We load the distilled Qwen3.5 2B model on the GPU and configure the parameters for controlled sampled generation. We also define the same generation, streaming, and chat-session logic so that both backends behave identically at runtime.
def parse_thinking(response: str) -> tuple:
    """Split a model response into (thinking, answer).

    The distilled models wrap their reasoning trace in <think>...</think>
    tags. Returns the trace and the text that follows it, both stripped.
    If no tags are present, the thinking part is empty and the entire
    response is treated as the answer.
    """
    # DOTALL lets the reasoning trace span multiple lines.
    m = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
    if m:
        return m.group(1).strip(), response[m.end():].strip()
    return "", response.strip()
def display_response(response: str):
    """Pretty-print a response: the thinking trace (truncated) then the answer."""
    thinking, answer = parse_thinking(response)
    if thinking:
        print("🧠 THINKING:")
        print("-" * 60)
        # Cap the printed trace at 1500 chars to keep cell output readable.
        print(thinking[:1500] + ("\n... [truncated]" if len(thinking) > 1500 else ""))
        print("-" * 60)
    print("\n💬 ANSWER:")
    print(answer)

print("✅ All helpers ready. Running tests...\n")
We define helper functions that extract the model's enclosed thinking trace from its response and display the reasoning and the final answer separately.
# --- TEST 1: basic arithmetic word problem with explained reasoning ---
print("=" * 70)
print("📝 TEST 1: Basic reasoning")
print("=" * 70)
response = generate_fn(
    "If I have 3 apples and give away half, then buy 5 more, how many do I have? "
    "Explain your reasoning."
)
display_response(response)

# --- TEST 2: token-by-token streaming ---
print("\n" + "=" * 70)
print("📝 TEST 2: Streaming output")
print("=" * 70)
stream_fn(
    "Explain the difference between concurrency and parallelism. "
    "Give a real-world analogy for each."
)

# --- TEST 3: compare default thinking mode vs a concise system prompt ---
print("\n" + "=" * 70)
print("📝 TEST 3: Thinking ON vs OFF")
print("=" * 70)
question = "What is the capital of France?"
print("\n--- Thinking ON (default) ---")
resp = generate_fn(question)
display_response(resp)
print("\n--- Thinking OFF (concise) ---")
resp = generate_fn(
    question,
    system_prompt="Answer directly and concisely. Do not use <think> tags.",
    max_new_tokens=256,
)
display_response(resp)

# --- TEST 4: classic cognitive-reflection trick question ---
print("\n" + "=" * 70)
print("📝 TEST 4: Bat & ball trick question")
print("=" * 70)
response = generate_fn(
    "A bat and a ball cost $1.10 in total. "
    "How much does the ball cost? Show complete reasoning and verify.",
    system_prompt="You are a precise mathematical reasoner. Set up equations and verify.",
    temperature=0.3,  # low temperature for deterministic math
)
display_response(response)

# --- TEST 5: relative-speed word problem ---
print("\n" + "=" * 70)
print("📝 TEST 5: Train meeting problem")
print("=" * 70)
response = generate_fn(
    "A train leaves Station A at 9:00 AM at 60 mph toward Station B. "
    "Another leaves Station B at 10:00 AM at 80 mph toward Station A. "
    "Stations are 280 miles apart. When and where do they meet?",
    temperature=0.3,
)
display_response(response)
# --- TEST 6: constraint-satisfaction logic puzzle ---
print("\n" + "=" * 70)
print("📝 TEST 6: Logic puzzle (five houses)")
print("=" * 70)
response = generate_fn(
    "Five houses in a row are painted different colors. "
    "The red house is left of the blue house. "
    "The green house is in the middle. "
    "The yellow house is not next to the blue house. "
    "The white house is at one end. "
    "What is the order from left to right?",
    temperature=0.3,
    max_new_tokens=3000,  # puzzles produce long reasoning traces
)
display_response(response)

# --- TEST 7: non-trivial algorithmic code generation ---
print("\n" + "=" * 70)
print("📝 TEST 7: Code generation — longest palindromic substring")
print("=" * 70)
response = generate_fn(
    "Write a Python function to find the longest palindromic substring "
    "using Manacher's algorithm. Include docstring, type hints, and tests.",
    system_prompt="You are an expert Python programmer. Think through the algorithm carefully.",
    max_new_tokens=3000,
    temperature=0.3,
)
display_response(response)

# --- TEST 8: multi-turn conversation using ChatSession history ---
print("\n" + "=" * 70)
print("📝 TEST 8: Multi-turn conversation (physics tutor)")
print("=" * 70)
session = ChatSession(
    system_prompt="You are a knowledgeable physics tutor. Explain clearly with examples."
)
turns = [
    "What is the Heisenberg uncertainty principle?",
    "Can you give me a concrete example with actual numbers?",
    "How does this relate to quantum tunneling?",
]
for i, q in enumerate(turns, 1):
    print(f"\n{'─'*60}")
    print(f"👤 Turn {i}: {q}")
    print(f"{'─'*60}")
    resp = session.chat(q, temperature=0.5)
    # Show only the answer portion, capped at 1000 chars per turn.
    _, answer = parse_thinking(resp)
    print(f"🤖 {answer[:1000]}{'...' if len(answer) > 1000 else ''}")
# --- TEST 9: sampling-temperature sweep on a creative prompt ---
print("\n" + "=" * 70)
print("📝 TEST 9: Temperature comparison — creative writing")
print("=" * 70)
creative_prompt = "Write a one-paragraph opening for a sci-fi story about AI consciousness."
configs = [
    {"label": "Low temp (0.1)", "temperature": 0.1, "top_p": 0.9},
    {"label": "Med temp (0.6)", "temperature": 0.6, "top_p": 0.95},
    {"label": "High temp (1.0)", "temperature": 1.0, "top_p": 0.98},
]
for cfg in configs:
    print(f"\n🎛️ {cfg['label']}")
    print("-" * 60)
    start = time.time()
    resp = generate_fn(
        creative_prompt,
        system_prompt="You are a creative fiction writer.",
        max_new_tokens=512,
        temperature=cfg["temperature"],
        top_p=cfg["top_p"],
    )
    elapsed = time.time() - start
    _, answer = parse_thinking(resp)
    print(answer[:600])
    print(f"⏱️ {elapsed:.1f}s")

# --- TEST 10: rough throughput benchmark ---
print("\n" + "=" * 70)
print("📝 TEST 10: Speed benchmark")
print("=" * 70)
start = time.time()
resp = generate_fn(
    "Explain how a neural network learns, step by step, for a beginner.",
    system_prompt="You are a patient, clear teacher.",
    max_new_tokens=1024,
)
elapsed = time.time() - start
# ~1.3 tokens per word is a rough heuristic for English text.
approx_tokens = int(len(resp.split()) * 1.3)
print(f"~{approx_tokens} tokens in {elapsed:.1f}s")
print(f"~{approx_tokens / elapsed:.1f} tokens/sec")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")

# --- Cleanup: drop whichever model object this run created, then free VRAM ---
import gc
for name in ["model", "llm"]:
    if name in globals():
        del globals()[name]
gc.collect()
torch.cuda.empty_cache()
print(f"\n✅ Memory freed. VRAM: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

print("\n" + "=" * 70)
print("🎉 Tutorial complete!")
print("=" * 70)
We run an extensive test suite that exercises the model across reasoning, streaming, logic puzzles, code generation, and multi-turn conversation. We compare outputs under different temperature settings and measure performance in terms of speed and token throughput. Finally, we clean up memory and free GPU resources, ensuring that the notebook remains reusable for further experiments.
In conclusion, we have a compact yet flexible setup for running Qwen3.5-based reasoning models distilled with Claude-style reasoning under real hardware constraints. The design abstracts away backend differences while exposing consistent generation, streaming, and dialogue interfaces, making it easy to explore the model's reasoning behavior. With the test suite, we investigate how well the model handles structured reasoning, tricky questions, and long multi-step tasks, while measuring speed and memory usage. What we end up with is not just a demo, but a reusable scaffold for testing and extending Qwen-based reasoning models in Colab without changing the core code.
Check out the full code on the source page. Also, feel free to follow us on Twitter, and don't forget to join our 120k+ ML SubReddit and subscribe to our newsletter. Are you on Telegram? You can join us there too.



