feat: 文章有效性校验，过滤无法打开或正文字数 < 100 的文章

新增 validate_article() 方法：
- 调用 GET /news/{id} 接口验证文章是否存在（HTTP 状态非 200，或返回的 code 不是 0/200 则无效）
- 去除 HTML 标签后统计正文字数，< 100 字则过滤
- 运行时缓存 _invalid_ids_cache：校验失败的 ID 在进程内永久跳过，避免重复 API 调用

静态黑名单更新：
- 新增 1952296583257133058（测试发现的无效文章）
- 静态黑名单与运行时缓存合并使用

get_news_list 流程：
1. 静态黑名单过滤（无 API 开销）
2. 热度+新鲜度加权采样
3. validate_article 逐篇校验
4. 若候选不足，从剩余池补充直到达到 count
2026-04-03 11:18:22 +08:00
parent f52bc7d147
commit e18c241bf0
1 changed files with 75 additions and 3 deletions
--- a/backend/app/services/news_service.py
+++ b/backend/app/services/news_service.py
@@ -29,6 +29,10 @@ class NewsPlatformService:
    async def _biz_url(self, db: AsyncSession) -> str:
        """Resolve the base URL used for news-platform requests.

        The value is obtained from ``self._cfg`` under the key
        ``news_platform_base_url``; the hard-coded address below is handed
        to ``_cfg`` as its third argument (presumably the fallback used when
        the key is not configured — confirm against ``_cfg``).
        """
        fallback_url = "http://192.168.1.200:63120"
        base_url = await self._cfg(db, "news_platform_base_url", fallback_url)
        return base_url
    # Runtime cache of article IDs that validate_article() rejected and chose
    # to remember, so that later checks can return early without an API call.
    # Declared in the class body, so a single set is shared by all instances.
    # The visible code only ever adds to it: entries persist for the lifetime
    # of the process and the set is never pruned.
    _invalid_ids_cache: set = set()
    async def _auth_url(self, db: AsyncSession) -> str:
        """Resolve the base URL used for authentication requests.

        The value is obtained from ``self._cfg`` under the key
        ``auth_base_url``; the hard-coded address below is handed to ``_cfg``
        as its third argument (presumably the fallback used when the key is
        not configured — confirm against ``_cfg``).
        """
        fallback_url = "http://192.168.1.200:60040"
        base_url = await self._cfg(db, "auth_base_url", fallback_url)
        return base_url
@@ -326,6 +330,52 @@ class NewsPlatformService:
            return False
    # ─── 新闻列表 ──────────────────────────────────────────────
    async def validate_article(self, db, user, article_id: str) -> bool:
        """Check whether an article can be opened and has enough body text.

        An article is usable when ``GET {biz}/news/{article_id}`` answers with
        HTTP 200, the JSON ``code`` field is 0 or 200, and the text left after
        stripping HTML tags is at least 100 characters long.

        IDs that fail for a reason tied to the article itself (HTTP 404/410,
        a rejecting ``code``, or too little text) are added to
        ``_invalid_ids_cache`` so later calls return early without a request.
        Failures that may be temporary (no session, any other HTTP status,
        an exception during the request) are NOT cached, so the article is
        checked again on the next call.

        Args:
            db: Database handle forwarded to ``_biz_url``.
            user: User whose ``id`` selects the session supplying the token.
            article_id: ID of the article to check.

        Returns:
            True if the article is usable, False if it should be skipped.
        """
        import re

        if not article_id:
            return False

        # Reach the shared set through the instance rather than through the
        # module-level singleton name, so the method works on any instance.
        rejected_ids = self._invalid_ids_cache
        if article_id in rejected_ids:
            # Cache hit: already known to be unusable, skip the API call.
            return False

        sess = await get_session(user.id)
        if not sess:
            return False

        biz = await self._biz_url(db)
        token = sess.get("token", "")
        try:
            async with httpx.AsyncClient(timeout=8) as c:
                r = await c.get(
                    f"{biz}/news/{article_id}",
                    headers=self._bearer(token),
                )
            if r.status_code != 200:
                if r.status_code in (404, 410):
                    # The article itself is gone: safe to remember.
                    rejected_ids.add(article_id)
                else:
                    # An expired token or a server error says nothing about
                    # the article; caching here would blacklist healthy
                    # articles until the process restarts.
                    logger.warning(f"[文章校验] {article_id} HTTP {r.status_code}，暂不缓存")
                return False

            d = r.json()
            if d.get("code") not in [0, 200]:
                logger.info(f"[文章校验] {article_id} 无效: code={d.get('code')} {d.get('message','')}")
                rejected_ids.add(article_id)
                return False

            data = d.get("data") or {}
            # Take the body (falling back to digest, then title), strip HTML
            # tags and count the remaining characters.
            raw = data.get("content") or data.get("digest") or data.get("newsTitle") or ""
            text = re.sub(r"<[^>]+>", "", raw).strip()
            if len(text) < 100:
                logger.info(f"[文章校验] {article_id} 正文过短({len(text)}字)，跳过")
                rejected_ids.add(article_id)
                return False
            return True
        except Exception as e:
            # Best-effort check: network/parse problems skip the article for
            # this call only, without caching the ID.
            logger.warning(f"[文章校验] {article_id} 请求异常: {e}")
            return False
    async def get_news_list(self, db, user, count=5, interest_tags=None) -> list:
        """
        GET /business/member/square/list  广场数据分页查询
@@ -398,11 +448,12 @@ class NewsPlatformService:
                    if platform_uid:
                        items = [x for x in items if x.get("createUser") != platform_uid]
                    # 过滤已知无效新闻（详情为空或不存在）
                    # 已知静态无效ID（直接过滤，无需 API 校验）
                    INVALID_IDS = {
                        "1965670408480907266","2029092495693975554","1960652956793597953",
                        "1960651987045347330","1960596408620838914","1960596083193180161",
-                        "1960595664341594113",
+                        "1960595664341594113","1952296583257133058",
-                    }
+                    } | news_service._invalid_ids_cache  # 合并运行时缓存
                    items = [x for x in items
                             if (x.get("recordId") or x.get("id")) not in INVALID_IDS]
                    logger.info(f"[广场新闻] {user.account} 获取到 {len(items)} 条（已过滤本人+无效文章）")
@@ -454,7 +505,28 @@ class NewsPlatformService:
                        i = pool.index(chosen_idx)
                        pool.pop(i)
                        w_pool.pop(i)
-                    return selected
+                    # ── 文章有效性校验（过滤不可开、字数<100的文章）─────
                    valid = []
                    for _a in selected:
                        _aid = str(_a.get("recordId") or _a.get("id", ""))
                        if await self.validate_article(db, user, _aid):
                            valid.append(_a)
                        # 若校验失败，尝试从候选池补充
                    # 若有效文章不够，从剩余候选中按权重补充
                    if len(valid) < count and len(pool) > 0:
                        remaining = [items[i] for i in pool]
                        _w2 = [weights[pool.index(i)] if i in pool else 1 for i in range(len(remaining))]
                        import random as _r2
                        _r2.shuffle(remaining)
                        for _a2 in remaining:
                            if len(valid) >= count:
                                break
                            _aid2 = str(_a2.get("recordId") or _a2.get("id", ""))
                            if await self.validate_article(db, user, _aid2):
                                valid.append(_a2)
                    if not valid:
                        logger.warning(f"[广场新闻] {user.account} 校验后无可用文章")
                    return valid
                logger.warning(f"[广场新闻] {user.account} code={d.get('code')} msg={d.get('message')}")
        except Exception as e:
            logger.error(f"[广场新闻] {user.account}: {e}")