PostgreSQL向量数据库支持:修订间差异

来自linuxsa wiki
跳转到导航 跳转到搜索
Evan留言 | 贡献
Evan留言 | 贡献
第424行: 第424行:


</pre>
</pre>
== 永久==
<pre>
CREATE EXTENSION IF NOT EXISTS vector;
👉 在 RDS 里扩展是 数据库级别 的(不是整个实例),也就是说你需要在每个数据库里执行一次。
永久启用(自动化方式)
RDS 没法像自建 PostgreSQL 那样直接改 postgresql.conf 或 shared_preload_libraries,所以“永久”开启通常要用以下方法:
方法 A:在模板数据库(template1)中启用
\c template1
CREATE EXTENSION IF NOT EXISTS vector;
这样之后所有 新建数据库 都会自动带上 pgvector。
方法 B:使用 init SQL 自动化(推荐)
你可以在 Terraform/CloudFormation 或 AWS CLI 创建数据库时执行 SQL 脚本,让数据库初始化时就启用扩展。
例如 Terraform:
resource "aws_rds_cluster" "example" {
  # ...
  database_name = "mydb"
  # 在 DB 初始化后用 aws_rds_cluster_parameter_group 或 Lambda 自动执行 SQL
}
</pre>
[[category:devops]]
[[category:devops]]

2025年8月21日 (四) 15:18的版本

起因 AI so hit

LLM version 13.18

PostgreSQL

install

https://www.postgresql.org/download/linux/debian/

PostgreSQL 内置支持

从 PostgreSQL 16 开始,内置了对向量操作的支持:

   新增 vector数据类型
    支持基本的向量运算

pgvector 扩展

pgvector 是 PostgreSQL 最流行的向量搜索扩展,它允许:

Amazon RDS supported extensions

on u24.04

Ubuntu 24.04


sudo apt install -y curl gnupg lsb-release
 2002  curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg
 2003  echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
 2004  sudo apt update

 2006  psgl --version
 2007  sudo apt install -y postgresql-13 postgresql-client-13 postgresql-server-dev-13

 2010  psql --version
 2011  sudo systemctl start postgresql
 2012  sudo apt install -y git make gcc

 2015  git clone https://github.com/pgvector/pgvector.git
 2016  cd pgvector/
 2018  git  checkout v0.7.0
  2020  make 
 2022  make install 
 2023  sudo -i -u postgres 
 2024  sudo systemctl start postgresql
 2025  sudo -i -u postgres
 
 

postgres@racknerd-0955f20:~$ sudo systemctl start postgresql
[sudo] password for postgres: 
sudo: a password is required
postgres@racknerd-0955f20:~$ 
logout
root@racknerd-0955f20:~/pgvector# sudo systemctl start postgresql
root@racknerd-0955f20:~/pgvector# sudo -i -u postgres
postgres@racknerd-0955f20:~$ psql
psql (13.22 (Ubuntu 13.22-1.pgdg22.04+1))
Type "help" for help.

postgres=# CREATE DATABASE testdb;
ERROR:  database "testdb" already exists
postgres=# \c testdb
You are now connected to database "testdb" as user "postgres".
testdb=# CREATE EXTENSION vector;
CREATE EXTENSION
testdb=# CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3));
CREATE TABLE
testdb=# INSERT INTO items (embedding) VALUES ('[1,2,3]');
INSERT 0 1
testdb=# SELECT * FROM items;
 id | embedding 
----+-----------
  1 | [1,2,3]
(1 row)

testdb=# 




Amazon RDS for PostgreSQL - AWS pgvector

tf create pg(used)

cat  main.tf
provider "aws" {
  region = "ap-east-1"
}

# 1. 创建 VPC
resource "aws_vpc" "rds_vpc" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "rds-vpc"
  }
}

# 2. 创建两个子网(RDS 至少需要2个子网)
resource "aws_subnet" "rds_subnet_1" {
  vpc_id                  = aws_vpc.rds_vpc.id
  cidr_block              = "10.0.1.0/24"
  availability_zone       = "ap-east-1a"
  map_public_ip_on_launch = true

  tags = {
    Name = "rds-subnet-1"
  }
}

resource "aws_subnet" "rds_subnet_2" {
  vpc_id                  = aws_vpc.rds_vpc.id
  cidr_block              = "10.0.2.0/24"
  availability_zone       = "ap-east-1b"
  map_public_ip_on_launch = true

  tags = {
    Name = "rds-subnet-2"
  }
}

# 3. 创建 Internet Gateway(让数据库可公网访问)
resource "aws_internet_gateway" "rds_igw" {
  vpc_id = aws_vpc.rds_vpc.id

  tags = {
    Name = "rds-igw"
  }
}

# 4. 创建 Route Table 并关联子网
resource "aws_route_table" "rds_rt" {
  vpc_id = aws_vpc.rds_vpc.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.rds_igw.id
  }

  tags = {
    Name = "rds-route-table"
  }
}

resource "aws_route_table_association" "rds_rta_1" {
  subnet_id      = aws_subnet.rds_subnet_1.id
  route_table_id = aws_route_table.rds_rt.id
}

resource "aws_route_table_association" "rds_rta_2" {
  subnet_id      = aws_subnet.rds_subnet_2.id
  route_table_id = aws_route_table.rds_rt.id
}

# 5. 安全组,允许本地访问 PostgreSQL (5432)
resource "aws_security_group" "rds_sg" {
  name        = "rds-postgres-sg"
  description = "Allow PostgreSQL inbound traffic"
  vpc_id      = aws_vpc.rds_vpc.id

  ingress {
    description = "PostgreSQL from my IP"
    from_port   = 5432
    to_port     = 5432
    protocol    = "tcp"
    cidr_blocks = ["107.172.217.42/32"] # ⚠️ 改成你本机公网IP
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "rds-sg"
  }
}

# 6. RDS Subnet Group
resource "aws_db_subnet_group" "rds_subnet_group" {
  name       = "rds-subnet-group"
  subnet_ids = [aws_subnet.rds_subnet_1.id, aws_subnet.rds_subnet_2.id]

  tags = {
    Name = "rds-subnet-group"
  }
}

# 7. 创建 RDS PostgreSQL 13.18
resource "aws_db_instance" "postgres" {
  identifier        = "my-postgres-db"
  allocated_storage = 20
  engine            = "postgres"
  engine_version    = "13.18"
  instance_class    = "db.t3.micro"
  username          = "myadmin"
  password          = "YourPassword1234"  # ⚠️ 建议改用 SSM/Secrets Manager
  db_subnet_group_name   = aws_db_subnet_group.rds_subnet_group.name
  vpc_security_group_ids = [aws_security_group.rds_sg.id]

  publicly_accessible = true
  skip_final_snapshot = true

  tags = {
    Name = "rds-postgres-13-18"
  }
}

####################
# 输出信息
####################
output "rds_endpoint" {
  value = aws_db_instance.postgres.endpoint
}
root@racknerd-0955f20:~/tfpg# 


<pre/>

==check==
<pre>
f20:~/tfpg# psql -h  my-postgres-db.chysey0o0hmr.ap-east-1.rds.amazonaws.com -U myadmin -d postgres 
Password for user myadmin: 
psql (13.22 (Ubuntu 13.22-1.pgdg22.04+1), server 13.18)
SSL connection (protocol: TLSv1.2, cipher: ECDHE-RSA-AES256-GCM-SHA384, bits: 256, compression: off)
Type "help" for help.

postgres=> CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION
postgres=> CREATE TABLE items (
  id bigserial PRIMARY KEY,
  embedding vector(3)  -- 三维向量
);
CREATE TABLE
postgres=> INSERT INTO items (embedding) VALUES ('[1,2,3]'), ('[4,5,6]');
INSERT 0 2
postgres=> -- 欧氏距离最近
SELECT * FROM items ORDER BY embedding <-> '[3,1,2]' LIMIT 1;
 id | embedding 
----+-----------
  1 | [1,2,3]
(1 row)

postgres=> 


#check 

postgres=> \dx
                             List of installed extensions
  Name   | Version |   Schema   |                     Description                      
---------+---------+------------+------------------------------------------------------
 plpgsql | 1.0     | pg_catalog | PL/pgSQL procedural language
 vector  | 0.8.0   | public     | vector data type and ivfflat and hnsw access methods
(2 rows)


** check 
-- 1. 创建一个带向量字段的表
CREATE TABLE items (
  id bigserial PRIMARY KEY,
  embedding vector(3)   -- 三维向量
);

-- 2. 插入几条数据
INSERT INTO items (embedding) VALUES
  ('[1,2,3]'),
  ('[4,5,6]'),
  ('[1,1,1]');

-- 3. 做一次向量相似度查询
-- 用 [1,2,3] 去查最相似的 embedding
SELECT id, embedding, embedding <-> '[1,2,3]' AS distance
FROM items
ORDER BY embedding <-> '[1,2,3]'
LIMIT 3;

fine 
 id | embedding |     distance     
----+-----------+------------------
  1 | [1,2,3]   |                0
  3 | [1,2,3]   |                0
  5 | [1,1,1]   | 2.23606797749979
(3 rows)




the other one


provider "aws" {
  region = "ap-east-1"
}

####################
# VPC & Subnets
####################
resource "aws_vpc" "main" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_support   = true
  enable_dns_hostnames = true
  tags = {
    Name = "rds-vpc"
  }
}

resource "aws_internet_gateway" "gw" {
  vpc_id = aws_vpc.main.id
}

resource "aws_subnet" "public_a" {
  vpc_id                  = aws_vpc.main.id
  cidr_block              = "10.0.1.0/24"
  availability_zone       = "ap-east-1a"
  map_public_ip_on_launch = true
  tags = {
    Name = "public-subnet-a"
  }
}

resource "aws_subnet" "public_b" {
  vpc_id                  = aws_vpc.main.id
  cidr_block              = "10.0.2.0/24"
  availability_zone       = "ap-east-1b"
  map_public_ip_on_launch = true
  tags = {
    Name = "public-subnet-b"
  }
}

resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.gw.id
  }
}

resource "aws_route_table_association" "a" {
  subnet_id      = aws_subnet.public_a.id
  route_table_id = aws_route_table.public.id
}

resource "aws_route_table_association" "b" {
  subnet_id      = aws_subnet.public_b.id
  route_table_id = aws_route_table.public.id
}

####################
# Security Group
####################
resource "aws_security_group" "rds_sg" {
  name        = "rds-sg"
  description = "Allow PostgreSQL inbound"
  vpc_id      = aws_vpc.main.id

  ingress {
    from_port   = 5432
    to_port     = 5432
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"] # ⚠️ 测试时允许所有,生产请限制IP
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "rds-sg"
  }
}

####################
# Subnet Group for RDS
####################
resource "aws_db_subnet_group" "rds_subnet_group" {
  name       = "rds-subnet-group"
  subnet_ids = [aws_subnet.public_a.id, aws_subnet.public_b.id]

  tags = {
    Name = "rds-subnet-group"
  }
}

####################
# RDS PostgreSQL
####################
resource "aws_db_instance" "postgres" {
  identifier              = "my-postgres-db"
  engine                  = "postgres"
  engine_version          = "13.18"
  instance_class          = "db.t3.micro" # 可调整
  allocated_storage       = 20
  username                = "admin"
  password                = "Password123!" # ⚠️ 生产请使用 secret manager
  db_subnet_group_name    = aws_db_subnet_group.rds_subnet_group.name
  vpc_security_group_ids  = [aws_security_group.rds_sg.id]
  publicly_accessible     = true
  skip_final_snapshot     = true

  tags = {
    Name = "my-postgres-db"
  }
}

####################
# 输出信息
####################
output "rds_endpoint" {
  value = aws_db_instance.postgres.endpoint
}

永久

CREATE EXTENSION IF NOT EXISTS vector;

👉 在 RDS 里扩展是 数据库级别 的(不是整个实例),也就是说你需要在每个数据库里执行一次。


永久启用(自动化方式)

RDS 没法像自建 PostgreSQL 那样直接改 postgresql.conf 或 shared_preload_libraries,所以“永久”开启通常要用以下方法:

方法 A:在模板数据库(template1)中启用

\c template1
CREATE EXTENSION IF NOT EXISTS vector;

这样之后所有 新建数据库 都会自动带上 pgvector。


方法 B:使用 init SQL 自动化(推荐)

你可以在 Terraform/CloudFormation 或 AWS CLI 创建数据库时执行 SQL 脚本,让数据库初始化时就启用扩展。

例如 Terraform:

resource "aws_rds_cluster" "example" {
  # ...
  database_name = "mydb"
  # 在 DB 初始化后用 aws_rds_cluster_parameter_group 或 Lambda 自动执行 SQL
}