使用 Python 脚本连接 Colab

import requests
import json

# ⚠️ Replace this with your own Cloudflare tunnel URL; the /completion
# path of llama-server's HTTP API is appended below.
server_root = "https://YOUR-CLOUDFLARE-LINK-HERE.trycloudflare.com"
endpoint = f"{server_root}/completion"

# Minimal smoke-test request: short prompt, at most 50 generated tokens.
request_body = {
    "prompt": "Hello, are you working?",
    "n_predict": 50
}

try:
    print(f"Sending request to: {endpoint}")
    # 30-second timeout so a dead tunnel fails fast instead of hanging forever.
    reply = requests.post(endpoint, json=request_body, timeout=30)

    if reply.status_code != 200:
        # Reached the server, but it answered with an error status.
        print(f"❌ 连接通了,但服务器报错: {reply.status_code}")
        print(reply.text)
    else:
        # Success: the generated text is under the 'content' key.
        print("✅ 成功连接!服务器回复:")
        print(reply.json().get('content'))

except Exception as err:
    # Network-level failure: DNS, refused connection, timeout, etc.
    print(f"❌ 彻底连不上: {err}")

局域网 Python 调用

import requests
import json

# llama-server's /completion endpoint on the local machine.
url = "http://127.0.0.1:8081/completion"

headers = {"Content-Type": "application/json"}
data = {
    "prompt": "You are a helpful assistant.\nUser: Explain C++ pointers.\nAssistant:",
    "n_predict": 128,    # maximum number of tokens to generate
    "temperature": 0.7   # sampling temperature
}

# Fixes over the original:
#  - timeout=30 so a hung/absent server fails fast instead of blocking forever;
#  - raise_for_status() turns an HTTP error into a clear exception instead of
#    crashing later on a missing 'content' key in an error body;
#  - response.json() replaces json.loads(response.text) (same result, idiomatic).
response = requests.post(url, headers=headers, json=data, timeout=30)
response.raise_for_status()
print(response.json()['content'])

在 Colab 中使用交互式对话

# A simple Python wrapper that shells out to the underlying C++ CLI,
# giving a pseudo-interactive chat loop inside a Colab cell.
import subprocess

while True:
    user_input = input("User: ")  # Colab renders this as an input box
    if user_input.lower() in ["exit", "quit"]:
        break

    # Re-invokes the CLI on every turn (slow: the model is reloaded each time).
    # Not recommended for performance, but it is the only "pseudo-interactive"
    # way to use the C++ binary from within a Colab cell.
    cmd = [
        "./bin/llama-cli",
        "-m", "../models/my-model-q4_k_m.gguf",
        "-p", f"User: {user_input}\nAssistant:",
        "-n", "128"
    ]
    # Fix: the original built `cmd` but never ran it. Execute the CLI
    # (argv list, shell=False — safe against shell injection from user_input)
    # and print what it produced.
    result = subprocess.run(cmd, capture_output=True, text=True)
    print(result.stdout)
    if result.returncode != 0:
        # Surface the CLI's error output so failures aren't silent.
        print(result.stderr)

推荐使用 llama-server 方法。那不仅能体验流畅的对话,还能看到 llama.cpp 作为后端服务时的性能指标(Request/s)。

类似文章

发表回复