Connecting to Colab with a Python script
import requests
import json

# ⚠️ Replace with your Cloudflare tunnel URL (the code appends /completion below)
base_url = "https://YOUR-CLOUDFLARE-LINK-HERE.trycloudflare.com"
api_url = f"{base_url}/completion"

payload = {
    "prompt": "Hello, are you working?",
    "n_predict": 50
}

try:
    print(f"Sending request to: {api_url}")
    response = requests.post(api_url, json=payload, timeout=30)  # 30-second timeout
    if response.status_code == 200:
        print("✅ Connected! Server reply:")
        print(response.json().get('content'))
    else:
        print(f"❌ Reached the server, but it returned an error: {response.status_code}")
        print(response.text)
except Exception as e:
    print(f"❌ Could not connect at all: {e}")
Calling the server over the local network with Python
import requests

url = "http://127.0.0.1:8081/completion"
headers = {"Content-Type": "application/json"}
data = {
    "prompt": "You are a helpful assistant.\nUser: Explain C++ pointers.\nAssistant:",
    "n_predict": 128,
    "temperature": 0.7
}

response = requests.post(url, headers=headers, json=data)
print(response.json()['content'])
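Recent llama-server builds also expose an OpenAI-compatible API on the same port. The sketch below assumes /v1/chat/completions is available in your build; the "model" field is a placeholder, since the server answers with whatever model it was started with:

import requests

# Sketch using the OpenAI-compatible endpoint (assumption: this build of
# llama-server exposes /v1/chat/completions on the same port).
url = "http://127.0.0.1:8081/v1/chat/completions"
data = {
    "model": "my-model",  # placeholder name; llama-server ignores it
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain C++ pointers."},
    ],
    "temperature": 0.7,
    "max_tokens": 128,
}

response = requests.post(url, json=data, timeout=60)
print(response.json()["choices"][0]["message"]["content"])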
Interactive use inside Colab
# A simple Python wrapper that shells out to the underlying C++ binary
import subprocess

while True:
    user_input = input("User: ")  # Colab supports this input prompt
    if user_input.lower() in ["exit", "quit"]:
        break
    # The CLI is re-invoked on every turn (slow, since the model reloads each time).
    # Note: not recommended for performance, but this is the only "pseudo-interactive"
    # way to use the C++ binary inside a Colab cell.
    cmd = [
        "./bin/llama-cli",
        "-m", "../models/my-model-q4_k_m.gguf",
        "-p", f"User: {user_input}\nAssistant:",
        "-n", "128"
    ]
    # Run the command and print its output
    result = subprocess.run(cmd, capture_output=True, text=True)
    print(result.stdout)
The llama-server approach is recommended: not only do you get a smooth chat experience, you can also see llama.cpp's performance metrics (requests/s) when it runs as a backend service.
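As a sketch of that recommended pattern, the loop below keeps the model loaded inside a running llama-server and drives the conversation over HTTP instead of re-spawning the CLI on every turn. It assumes the server from the LAN example above is still listening on port 8081:

import requests

# Interactive loop against a running llama-server (assumption: the server
# started earlier is still listening on 127.0.0.1:8081). The model stays
# loaded between turns, so each reply is fast.
api_url = "http://127.0.0.1:8081/completion"
history = "You are a helpful assistant.\n"

while True:
    user_input = input("User: ")
    if user_input.lower() in ("exit", "quit"):
        break
    # Accumulate a plain-text transcript so the model sees prior turns
    history += f"User: {user_input}\nAssistant:"
    resp = requests.post(api_url, json={"prompt": history, "n_predict": 128}, timeout=120)
    reply = resp.json().get("content", "").strip()
    print(f"Assistant: {reply}")
    history += f" {reply}\n"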