NCBI数据库BioProject中的Description

生物信息学

Publish Date: 2024-06-16

Update Date: 2024-06-16

Word Count: 191

Read Time: 1 Min

Read Count:

import requests
from bs4 import BeautifulSoup

# Scrape the description text of each NCBI BioProject listed in `file_path`
# and store the results as tab-separated "<accession>\t<description>" rows.

# Input: one BioProject accession per line.
file_path = "D:/OneDrive/NAS/科研相关/PhData/data/生信挖掘/水稻多效基因/data/NCBI.BioProject.Rice.txt"

# Output: one row per successfully fetched BioProject page.
file_out_path = "D:/OneDrive/NAS/科研相关/PhData/data/生信挖掘/水稻多效基因/data/NCBI.BioProject.Rice.description.txt"

# Seconds to wait for NCBI before giving up on a single request; without a
# timeout, requests.get() may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Both files are managed by `with`, so the output is flushed and closed even
# if the loop aborts midway.
# NOTE(review): the input is presumably plain-ASCII accession IDs, so reading
# it as UTF-8 is safe — confirm if the list was saved in another encoding.
with open(file_path, "r", encoding="utf-8") as f, \
        open(file_out_path, "w", encoding="utf-8") as file_out:
    for line in f:
        # Strip the newline (and any stray whitespace) once, up front.
        accession = line.strip()
        if not accession:
            # A blank line would otherwise request the bare /bioproject/ URL.
            continue

        # URL of the BioProject
        url = "https://www.ncbi.nlm.nih.gov/bioproject/" + accession

        # Send a GET request to the webpage; a network failure on one
        # accession is reported and skipped rather than ending the whole run.
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f"BioProject{accession}: Failed to retrieve the webpage. Error: {err}")
            continue

        if response.status_code != 200:
            print(f"BioProject{accession}: Failed to retrieve the webpage. Status code: {response.status_code}")
            continue

        # Parse the page content
        soup = BeautifulSoup(response.content, "html.parser")

        # The description lives in <div id="DescrAll">; pages without that
        # element are recorded with the placeholder "None".
        descr_div = soup.find("div", id="DescrAll")
        if descr_div is None:
            description = "None"
        else:
            # Collapse every whitespace run (newlines, tabs, carriage returns)
            # to a single space so the text cannot break the TSV row.
            description = " ".join(descr_div.get_text(strip=True).split())

        record = accession + "\t" + description
        file_out.write(record + "\n")

        # Echo progress to the console. A console whose code page cannot
        # represent some characters raises UnicodeEncodeError from print();
        # fall back to an escaped ASCII rendering instead of crashing.
        try:
            print(record + "\n")
        except UnicodeEncodeError:
            print(record.encode("ascii", "backslashreplace").decode("ascii") + "\n")

        print("================================================")