Havoc412 72ba8efb25 refactor(model_es): 优化 Encounter.TopK 函数并添加注释
- 在 Encounter.TopK 函数中添加了处理 ES 中 embedding 为 null 的情况的注释
- 调整了 body 字符串的格式,提高了可读性
- 移除了 web.go 中的多余导入和注释
2024-11-24 00:43:21 +08:00

228 lines
5.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package model_es
import (
"bytes"
"catface/app/global/consts"
"catface/app/global/variable"
"catface/app/model"
"catface/app/utils/data_bind"
"catface/app/utils/model_handler"
"context"
"encoding/json"
"fmt"
"github.com/elastic/go-elasticsearch/v8"
"github.com/elastic/go-elasticsearch/v8/esapi"
)
func CreateEncounterESFactory(encounter *model.Encounter) *Encounter {
if encounter == nil { // UPDATE 这样写好丑。
return &Encounter{}
}
// 我把数值绑定到了工厂创建当中。
return &Encounter{
Id: encounter.Id,
Title: encounter.Title,
Content: encounter.Content,
Tags: encounter.TagsList, // TODO 暂时没有对此字段的查询。
}
}
// INFO 存储能够作为索引存在的数据。
type Encounter struct {
Id int64 `json:"id"`
Title string `json:"title" explain:"路遇笔记标题"`
Content string `json:"content" explain:"内容"`
Tags []string `json:"tags" explain:"标签"`
Embedding []float64 `json:"embedding"`
TagsHighlight []string `json:"-" bind:"tags_highlight"`
}
func (e *Encounter) IndexName() string {
return "catface_encounters"
}
func (e *Encounter) InsertDocument() error {
ctx := context.Background()
// 将结构体转换为 JSON 字符串
data, err := json.Marshal(e)
if err != nil {
return err
}
// 创建请求
req := esapi.IndexRequest{
Index: e.IndexName(),
DocumentID: fmt.Sprintf("%d", e.Id),
Body: bytes.NewReader(data),
Refresh: "true",
}
// 发送请求
res, err := req.Do(ctx, variable.ElasticClient)
if err != nil {
return err
}
defer res.Body.Close()
if res.IsError() {
var e map[string]interface{}
if err := json.NewDecoder(res.Body).Decode(&e); err != nil {
return fmt.Errorf("error parsing the response body: %s", err)
} else {
return fmt.Errorf("[%s] %s: %s",
res.Status(),
e["error"].(map[string]interface{})["type"],
e["error"].(map[string]interface{})["reason"],
)
}
}
return nil
}
// TODO 改正,仿 Insert
func (e *Encounter) UpdateDocument(client *elasticsearch.Client, encounter *Encounter) error {
ctx := context.Background()
// 将结构体转换为 JSON 字符串
data, err := json.Marshal(map[string]interface{}{
"doc": encounter,
})
if err != nil {
return err
}
// 创建请求
req := esapi.UpdateRequest{
Index: encounter.IndexName(),
DocumentID: fmt.Sprintf("%d", encounter.Id),
Body: bytes.NewReader(data),
Refresh: "true",
}
// 发送请求
res, err := req.Do(ctx, client)
if err != nil {
return err
}
defer res.Body.Close()
if res.IsError() {
var e map[string]interface{}
if err := json.NewDecoder(res.Body).Decode(&e); err != nil {
return fmt.Errorf("error parsing the response body: %s", err)
} else {
return fmt.Errorf("[%s] %s: %s",
res.Status(),
e["error"].(map[string]interface{})["type"],
e["error"].(map[string]interface{})["reason"],
)
}
}
return nil
}
/**
* @description: 粗略地包含各种关键词匹配,
* @param {*elasticsearch.Client} client
* @param {string} query
* @return {*} 对应 Encounter 的 id然后交给 MySQL 来查询详细的信息?
*/
func (e *Encounter) QueryDocumentsMatchAll(query string, num int) ([]Encounter, error) {
body := fmt.Sprintf(`{
"size": %d,
"query": {
"bool": {
"should": [
{"match": {"tags": "%s"}},
{"match": {"content": "%s"}},
{"match": {"title": "%s"}}
]
}
},
"highlight": {
"pre_tags": ["%v"],
"post_tags": ["%v"],
"fields": {
"title": {},
"content": {
"fragment_size" : 15
},
"tags": {
"pre_tags": [""],
"post_tags": [""]
}
}
},
"_source": ["id", "title", "content", "tags"]
}`, num, query, query, query, consts.PreTags, consts.PostTags)
hits, err := model_handler.SearchRequest(body, e.IndexName())
if err != nil {
return nil, err
}
var encounters []Encounter
for _, hit := range hits {
data := model_handler.MergeSouceWithHighlight(hit.(map[string]interface{}))
var encounter Encounter
if err := data_bind.ShouldBindFormMapToModel(data, &encounter); err != nil {
continue
}
encounters = append(encounters, encounter)
}
return encounters, nil
}
func (e *Encounter) TopK(embedding []float64, k int) ([]Encounter, error) {
// 同理 Doc
params := map[string]interface{}{
"query_vector": embedding,
}
paramsJSON, err := json.Marshal(params)
if err != nil {
return nil, err
}
// UPDATE 这里无法处理 ES 中 embedding 为 null 的情况。
body := fmt.Sprintf(`{
"size": %d,
"query": {
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
"params": %s
}
}
},
"_source": ["id"]
}`, k, string(paramsJSON))
hits, err := model_handler.SearchRequest(body, e.IndexName())
if err != nil {
return nil, err
}
var encounters []Encounter
for _, hit := range hits {
hitMap := hit.(map[string]interface{})
source := hitMap["_source"].(map[string]interface{})
var encounter Encounter
if err := data_bind.ShouldBindFormMapToModel(source, &encounter); err != nil {
continue
}
encounters = append(encounters, encounter)
}
return encounters, nil
}