feat: 文章有效性校验,过滤无法打开或正文字数<100的文章
新增 validate_article() 方法:
- 调用 GET /news/{id} 接口验证文章是否存在(HTTP 状态非 200,或返回 code 不为 0/200 则无效)
- 去除 HTML 标签后统计正文字数,< 100 字则过滤
- 运行时缓存 _invalid_ids_cache:校验失败的 ID 进程内永久跳过,避免重复 API 调用
静态黑名单更新:
- 新增 1952296583257133058(测试发现的无效文章)
- 静态黑名单与运行时缓存合并使用
get_news_list 流程:
1. 静态黑名单过滤(无 API 开销)
2. 热度+新鲜度加权采样
3. validate_article 逐篇校验
4. 若候选不足,从剩余池补充直到达到 count
This commit is contained in:
@@ -29,6 +29,10 @@ class NewsPlatformService:
|
||||
async def _biz_url(self, db: AsyncSession) -> str:
    """Return the base URL of the news-platform business API.

    Reads the ``news_platform_base_url`` setting through ``self._cfg``.
    The third argument is presumably the value used when the setting is
    absent — confirm against ``_cfg``.
    """
    setting_key = "news_platform_base_url"
    fallback_url = "http://192.168.1.200:63120"
    base_url = await self._cfg(db, setting_key, fallback_url)
    return base_url
|
||||
|
||||
# Runtime cache of article IDs that validate_article has rejected (lookup
# rejected by the API, or body text too short). Request exceptions are
# not recorded here, so those IDs are retried on the next call.
|
||||
# Persists for the lifetime of the process — avoids repeated API calls.
# get_news_list unions this set with its static INVALID_IDS blacklist.
# NOTE(review): appears to be declared in the class body, which would make
# the one set shared by all instances rather than per-instance — confirm.
|
||||
_invalid_ids_cache: set = set()
|
||||
|
||||
async def _auth_url(self, db: AsyncSession) -> str:
    """Return the base URL of the authentication service.

    Reads the ``auth_base_url`` setting through ``self._cfg``. The third
    argument is presumably the value used when the setting is absent —
    confirm against ``_cfg``.
    """
    setting_key = "auth_base_url"
    fallback_url = "http://192.168.1.200:60040"
    base_url = await self._cfg(db, setting_key, fallback_url)
    return base_url
|
||||
|
||||
@@ -326,6 +330,52 @@ class NewsPlatformService:
|
||||
return False
|
||||
|
||||
# ─── 新闻列表 ──────────────────────────────────────────────
|
||||
async def validate_article(self, db, user, article_id: str) -> bool:
    """Check whether an article is usable before it is offered to a user.

    An article is accepted when ``GET {biz}/news/{article_id}`` answers with
    HTTP 200, the JSON envelope's ``code`` is 0 or 200, and the body text
    (``content``, falling back to ``digest`` and then ``newsTitle``) is at
    least 100 characters long once HTML tags are stripped.

    Definitive rejections (HTTP 404/410, an unexpected business ``code``, or
    text that is too short) are added to ``_invalid_ids_cache`` so the same
    ID is not fetched again in this process. Transient failures (any other
    non-200 status, request exceptions, a missing session) only fail the
    current call: the cache is not keyed by user, so caching them would let
    one expired token or a brief outage blacklist an article for everyone
    until the process restarts.

    Args:
        db: Database handle forwarded to ``self._biz_url``.
        user: Caller whose ``id`` is used to look up the platform session.
        article_id: Article ID to validate; an empty value is rejected.

    Returns:
        True if the article is usable, False if it should be skipped.
    """
    # Function-scope import keeps this block self-contained; aliased so it
    # cannot shadow any module-level name.
    import re as _re

    min_chars = 100
    # Statuses that mean "this article does not exist" rather than
    # "the request could not be served right now".
    gone_statuses = (404, 410)

    if not article_id:
        return False

    # Reached through self instead of the module-level singleton; the set
    # object itself is mutated in place, so all holders see the new entries.
    invalid_ids = self._invalid_ids_cache
    if article_id in invalid_ids:
        return False

    sess = await get_session(user.id)
    if not sess:
        return False

    biz = await self._biz_url(db)
    token = sess.get("token", "")
    try:
        async with httpx.AsyncClient(timeout=8) as c:
            r = await c.get(
                f"{biz}/news/{article_id}",
                headers=self._bearer(token),
            )

        if r.status_code != 200:
            if r.status_code in gone_statuses:
                invalid_ids.add(article_id)
            else:
                # 401/403/5xx etc. say nothing about the article itself.
                logger.warning(
                    f"[文章校验] {article_id} HTTP {r.status_code},本次跳过(不缓存)"
                )
            return False

        d = r.json()
        code = d.get("code")
        if code not in (0, 200):
            # NOTE(review): if the platform also reports auth/session errors
            # through this business code, those should not be cached either.
            logger.info(f"[文章校验] {article_id} 无效: code={code} {d.get('message','')}")
            invalid_ids.add(article_id)
            return False

        data = d.get("data") or {}
        # Strip HTML tags, then count the remaining characters.
        raw = data.get("content") or data.get("digest") or data.get("newsTitle") or ""
        text = _re.sub(r"<[^>]+>", "", raw).strip()
        if len(text) < min_chars:
            logger.info(f"[文章校验] {article_id} 正文过短({len(text)}字),跳过")
            invalid_ids.add(article_id)
            return False
        return True
    except Exception as e:
        # Best-effort check: network errors, bad JSON or an unexpected
        # payload shape fail this call only and are never cached.
        logger.warning(f"[文章校验] {article_id} 请求异常: {e}")
        return False
|
||||
|
||||
async def get_news_list(self, db, user, count=5, interest_tags=None) -> list:
|
||||
"""
|
||||
GET /business/member/square/list 广场数据分页查询
|
||||
@@ -398,11 +448,12 @@ class NewsPlatformService:
|
||||
if platform_uid:
|
||||
items = [x for x in items if x.get("createUser") != platform_uid]
|
||||
# 过滤已知无效新闻(详情为空或不存在)
|
||||
# 已知静态无效ID(直接过滤,无需 API 校验)
|
||||
INVALID_IDS = {
|
||||
"1965670408480907266","2029092495693975554","1960652956793597953",
|
||||
"1960651987045347330","1960596408620838914","1960596083193180161",
|
||||
"1960595664341594113",
|
||||
}
|
||||
"1960595664341594113","1952296583257133058",
|
||||
} | news_service._invalid_ids_cache # 合并运行时缓存
|
||||
items = [x for x in items
|
||||
if (x.get("recordId") or x.get("id")) not in INVALID_IDS]
|
||||
logger.info(f"[广场新闻] {user.account} 获取到 {len(items)} 条(已过滤本人+无效文章)")
|
||||
@@ -454,7 +505,28 @@ class NewsPlatformService:
|
||||
i = pool.index(chosen_idx)
|
||||
pool.pop(i)
|
||||
w_pool.pop(i)
|
||||
return selected
|
||||
# ── 文章有效性校验(过滤不可开、字数<100的文章)─────
|
||||
valid = []
|
||||
for _a in selected:
|
||||
_aid = str(_a.get("recordId") or _a.get("id", ""))
|
||||
if await self.validate_article(db, user, _aid):
|
||||
valid.append(_a)
|
||||
# 若校验失败,尝试从候选池补充
|
||||
# 若有效文章不够,从剩余候选中按权重补充
|
||||
if len(valid) < count and len(pool) > 0:
|
||||
remaining = [items[i] for i in pool]
|
||||
_w2 = [weights[pool.index(i)] if i in pool else 1 for i in range(len(remaining))]
|
||||
import random as _r2
|
||||
_r2.shuffle(remaining)
|
||||
for _a2 in remaining:
|
||||
if len(valid) >= count:
|
||||
break
|
||||
_aid2 = str(_a2.get("recordId") or _a2.get("id", ""))
|
||||
if await self.validate_article(db, user, _aid2):
|
||||
valid.append(_a2)
|
||||
if not valid:
|
||||
logger.warning(f"[广场新闻] {user.account} 校验后无可用文章")
|
||||
return valid
|
||||
logger.warning(f"[广场新闻] {user.account} code={d.get('code')} msg={d.get('message')}")
|
||||
except Exception as e:
|
||||
logger.error(f"[广场新闻] {user.account}: {e}")
|
||||
|
||||
Reference in New Issue
Block a user